From 039ef87199757d66e320909a9a3676c2f252cab1 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Wed, 13 Dec 2023 03:56:08 +0100 Subject: [PATCH] feat: Add support for pgvector's `vector` data type --- .github/workflows/ci_workflow.yml | 2 +- README.md | 25 +++++++- docker-compose.yml | 12 +++- poetry.lock | 60 +++++++++++++++++-- pyproject.toml | 4 ++ target_postgres/connector.py | 43 +++++++++++++ .../data_files/array_float_vector.singer | 5 ++ target_postgres/tests/init.sql | 1 + target_postgres/tests/test_target_postgres.py | 18 ++++++ tox.ini | 8 +-- 10 files changed, 163 insertions(+), 15 deletions(-) create mode 100644 target_postgres/tests/data_files/array_float_vector.singer create mode 100644 target_postgres/tests/init.sql diff --git a/.github/workflows/ci_workflow.yml b/.github/workflows/ci_workflow.yml index 9ca716d4..7fa2f3c2 100644 --- a/.github/workflows/ci_workflow.yml +++ b/.github/workflows/ci_workflow.yml @@ -36,7 +36,7 @@ jobs: pipx install poetry - name: Install dependencies run: | - poetry install + poetry install --all-extras - name: Run pytest run: | poetry run pytest --capture=no diff --git a/README.md b/README.md index a1b48c33..4ba4adeb 100644 --- a/README.md +++ b/README.md @@ -102,7 +102,7 @@ tap-carbon-intensity | target-postgres --config /path/to/target-postgres-config. ```bash pipx install poetry -poetry install +poetry install --all-extras pipx install pre-commit pre-commit install ``` @@ -152,6 +152,8 @@ develop your own Singer taps and targets. ## Data Types +### Mapping + The below table shows how this tap will map between jsonschema datatypes and Postgres datatypes. | jsonschema | Postgres | @@ -202,7 +204,20 @@ The below table shows how this tap will map between jsonschema datatypes and Pos Note that while object types are mapped directly to jsonb, array types are mapped to a jsonb array. -If a column has multiple jsonschema types, the following order is using to order Postgres types, from highest priority to lowest priority. +When using [pgvector], this type mapping applies, additionally to the table above. + +| jsonschema | Postgres | +|------------------------------------------------|----------| +| array (with additional SCHEMA annotations [1]) | vector | + +[1] `"storage": {"type": "vector", "dim": 4}` + +### Resolution Order + +If a column has multiple jsonschema types, there is a priority list for +resolving the best type candidate, from the highest priority to the +lowest priority. + - ARRAY(JSONB) - JSONB - TEXT @@ -215,3 +230,9 @@ If a column has multiple jsonschema types, the following order is using to order - INTEGER - BOOLEAN - NOTYPE + +When using [pgvector], the `pgvector.sqlalchemy.Vector` type is added to the bottom +of the list. + + +[pgvector]: https://github.com/pgvector/pgvector diff --git a/docker-compose.yml b/docker-compose.yml index f2d453c4..a187f7c9 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,7 +3,7 @@ version: "2.1" services: postgres: - image: docker.io/postgres:latest + image: ankane/pgvector:latest command: postgres -c ssl=on -c ssl_cert_file=/var/lib/postgresql/server.crt -c ssl_key_file=/var/lib/postgresql/server.key -c ssl_ca_file=/var/lib/postgresql/ca.crt -c hba_file=/var/lib/postgresql/pg_hba.conf environment: POSTGRES_USER: postgres @@ -13,6 +13,7 @@ services: POSTGRES_INITDB_ARGS: --auth-host=cert # Not placed in the data directory (/var/lib/postgresql/data) because of https://gist.github.com/mrw34/c97bb03ea1054afb551886ffc8b63c3b?permalink_comment_id=2678568#gistcomment-2678568 volumes: + - ./target_postgres/tests/init.sql:/docker-entrypoint-initdb.d/init.sql - ./ssl/server.crt:/var/lib/postgresql/server.crt # Certificate verifying the server's identity to the client. - ./ssl/server.key:/var/lib/postgresql/server.key # Private key to verify the server's certificate is legitimate. - ./ssl/ca.crt:/var/lib/postgresql/ca.crt # Certificate authority to use when verifying the client's identity to the server. @@ -20,9 +21,11 @@ services: ports: - "5432:5432" postgres_no_ssl: # Borrowed from https://github.com/MeltanoLabs/tap-postgres/blob/main/.github/workflows/test.yml#L13-L23 - image: docker.io/postgres:latest + image: ankane/pgvector:latest environment: POSTGRES_PASSWORD: postgres + volumes: + - ./target_postgres/tests/init.sql:/docker-entrypoint-initdb.d/init.sql ports: - 5433:5432 ssh: @@ -37,17 +40,20 @@ services: - PASSWORD_ACCESS=false - USER_NAME=melty volumes: + - ./target_postgres/tests/init.sql:/docker-entrypoint-initdb.d/init.sql - ./ssh_tunnel/ssh-server-config:/config/ssh_host_keys:ro ports: - "127.0.0.1:2223:2222" networks: - inner postgresdb: - image: postgres:13.0 + image: ankane/pgvector:latest environment: POSTGRES_USER: postgres POSTGRES_PASSWORD: postgres POSTGRES_DB: main + volumes: + - ./target_postgres/tests/init.sql:/docker-entrypoint-initdb.d/init.sql networks: inner: ipv4_address: 10.5.0.5 diff --git a/poetry.lock b/poetry.lock index e56bfda3..56854d0f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. [[package]] name = "appdirs" @@ -736,6 +736,43 @@ files = [ {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, ] +[[package]] +name = "numpy" +version = "1.24.4" +description = "Fundamental package for array computing in Python" +optional = true +python-versions = ">=3.8" +files = [ + {file = "numpy-1.24.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0bfb52d2169d58c1cdb8cc1f16989101639b34c7d3ce60ed70b19c63eba0b64"}, + {file = "numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed094d4f0c177b1b8e7aa9cba7d6ceed51c0e569a5318ac0ca9a090680a6a1b1"}, + {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79fc682a374c4a8ed08b331bef9c5f582585d1048fa6d80bc6c35bc384eee9b4"}, + {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ffe43c74893dbf38c2b0a1f5428760a1a9c98285553c89e12d70a96a7f3a4d6"}, + {file = "numpy-1.24.4-cp310-cp310-win32.whl", hash = "sha256:4c21decb6ea94057331e111a5bed9a79d335658c27ce2adb580fb4d54f2ad9bc"}, + {file = "numpy-1.24.4-cp310-cp310-win_amd64.whl", hash = "sha256:b4bea75e47d9586d31e892a7401f76e909712a0fd510f58f5337bea9572c571e"}, + {file = "numpy-1.24.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f136bab9c2cfd8da131132c2cf6cc27331dd6fae65f95f69dcd4ae3c3639c810"}, + {file = "numpy-1.24.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e2926dac25b313635e4d6cf4dc4e51c8c0ebfed60b801c799ffc4c32bf3d1254"}, + {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:222e40d0e2548690405b0b3c7b21d1169117391c2e82c378467ef9ab4c8f0da7"}, + {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7215847ce88a85ce39baf9e89070cb860c98fdddacbaa6c0da3ffb31b3350bd5"}, + {file = "numpy-1.24.4-cp311-cp311-win32.whl", hash = "sha256:4979217d7de511a8d57f4b4b5b2b965f707768440c17cb70fbf254c4b225238d"}, + {file = "numpy-1.24.4-cp311-cp311-win_amd64.whl", hash = "sha256:b7b1fc9864d7d39e28f41d089bfd6353cb5f27ecd9905348c24187a768c79694"}, + {file = "numpy-1.24.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1452241c290f3e2a312c137a9999cdbf63f78864d63c79039bda65ee86943f61"}, + {file = "numpy-1.24.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:04640dab83f7c6c85abf9cd729c5b65f1ebd0ccf9de90b270cd61935eef0197f"}, + {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5425b114831d1e77e4b5d812b69d11d962e104095a5b9c3b641a218abcc050e"}, + {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd80e219fd4c71fc3699fc1dadac5dcf4fd882bfc6f7ec53d30fa197b8ee22dc"}, + {file = "numpy-1.24.4-cp38-cp38-win32.whl", hash = "sha256:4602244f345453db537be5314d3983dbf5834a9701b7723ec28923e2889e0bb2"}, + {file = "numpy-1.24.4-cp38-cp38-win_amd64.whl", hash = "sha256:692f2e0f55794943c5bfff12b3f56f99af76f902fc47487bdfe97856de51a706"}, + {file = "numpy-1.24.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2541312fbf09977f3b3ad449c4e5f4bb55d0dbf79226d7724211acc905049400"}, + {file = "numpy-1.24.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9667575fb6d13c95f1b36aca12c5ee3356bf001b714fc354eb5465ce1609e62f"}, + {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3a86ed21e4f87050382c7bc96571755193c4c1392490744ac73d660e8f564a9"}, + {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d11efb4dbecbdf22508d55e48d9c8384db795e1b7b51ea735289ff96613ff74d"}, + {file = "numpy-1.24.4-cp39-cp39-win32.whl", hash = "sha256:6620c0acd41dbcb368610bb2f4d83145674040025e5536954782467100aa8835"}, + {file = "numpy-1.24.4-cp39-cp39-win_amd64.whl", hash = "sha256:befe2bf740fd8373cf56149a5c23a0f601e82869598d41f8e188a0e9869926f8"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:31f13e25b4e304632a4619d0e0777662c2ffea99fcae2029556b17d8ff958aef"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95f7ac6540e95bc440ad77f56e520da5bf877f87dca58bd095288dce8940532a"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e98f220aa76ca2a977fe435f5b04d7b3470c0a2e6312907b37ba6068f26787f2"}, + {file = "numpy-1.24.4.tar.gz", hash = "sha256:80f5e3a4e498641401868df4208b74581206afbee7cf7b8329daae82676d9463"}, +] + [[package]] name = "packaging" version = "23.2" @@ -813,6 +850,19 @@ files = [ python-dateutil = ">=2.6,<3.0" pytzdata = ">=2020.1" +[[package]] +name = "pgvector" +version = "0.2.4" +description = "pgvector support for Python" +optional = true +python-versions = ">=3.8" +files = [ + {file = "pgvector-0.2.4-py2.py3-none-any.whl", hash = "sha256:548e1f88d3c7433020c1c177feddad2f36915c262852d621f9018fcafff6870b"}, +] + +[package.dependencies] +numpy = "*" + [[package]] name = "pkgutil-resolve-name" version = "1.3.10" @@ -898,7 +948,6 @@ files = [ {file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"}, {file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0605eaed3eb239e87df0d5e3c6489daae3f7388d455d0c0b4df899519c6a38d"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"}, @@ -907,8 +956,6 @@ files = [ {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-win32.whl", hash = "sha256:64cf30263844fa208851ebb13b0732ce674d8ec6a0c86a4e160495d299ba3c93"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl", hash = "sha256:81ff62668af011f9a48787564ab7eded4e9fb17a4a6a74af5ffa6a457400d2ab"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"}, @@ -1767,7 +1814,10 @@ files = [ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"] testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy (>=0.9.1)", "pytest-ruff"] +[extras] +pgvector = ["pgvector"] + [metadata] lock-version = "2.0" python-versions = "<3.12,>=3.8.1" -content-hash = "aee5c5d02d31c400a67fb99db6497d4696b1c5de20c00fc20e1f1d13abae1319" +content-hash = "8cefd8ad15e8f50b0f841f7ade76d529a2f112ff959109b2deb79e58db8fcc49" diff --git a/pyproject.toml b/pyproject.toml index 6c1d4e4b..622b034c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ packages = [ python = "<3.12,>=3.8.1" requests = "^2.25.1" singer-sdk = ">=0.28,<0.34" +pgvector = { version="^0.2.4", optional = true } psycopg2-binary = "2.9.9" sqlalchemy = ">=2.0,<3.0" sshtunnel = "0.4.0" @@ -50,6 +51,9 @@ types-simplejson = "^3.19.0.2" types-sqlalchemy = "^1.4.53.38" types-jsonschema = "^4.19.0.3" +[tool.poetry.extras] +pgvector = ["pgvector"] + [tool.mypy] exclude = "tests" diff --git a/target_postgres/connector.py b/target_postgres/connector.py index 09554d59..cede77e9 100644 --- a/target_postgres/connector.py +++ b/target_postgres/connector.py @@ -277,6 +277,42 @@ def pick_individual_type(jsonschema_type: dict): if "object" in jsonschema_type["type"]: return JSONB() if "array" in jsonschema_type["type"]: + # This currently uses a non-conformant definition for the Singer SCHEMA, + # using the `additionalProperties` attribute to convey additional type + # information, agnostic of the target database. + """ + Schema override rule in `meltano.yml`: + + type: "array" + items: + type: "number" + additionalProperties: + storage: + type: "vector" + dim: 4 + + Produced schema annotation in `catalog.json`: + + {"type": "array", + "items": {"type": "number"}, + "additionalProperties": {"storage": {"type": "vector", "dim": 4}}} + """ + if ( + "additionalProperties" in jsonschema_type + and "storage" in jsonschema_type["additionalProperties"] + ): + storage_properties = jsonschema_type["additionalProperties"]["storage"] + if ( + "type" in storage_properties + and storage_properties["type"] == "vector" + ): + # On PostgreSQL/pgvector, use the corresponding type definition + # from its SQLAlchemy dialect. + from pgvector.sqlalchemy import ( + Vector, # type: ignore[import-untyped] + ) + + return Vector(storage_properties["dim"]) return ARRAY(JSONB()) if jsonschema_type.get("format") == "date-time": return TIMESTAMP() @@ -310,6 +346,13 @@ def pick_best_sql_type(sql_type_array: list): NOTYPE, ] + try: + from pgvector.sqlalchemy import Vector + + precedence_order.append(Vector) + except ImportError: + pass + for sql_type in precedence_order: for obj in sql_type_array: if isinstance(obj, sql_type): diff --git a/target_postgres/tests/data_files/array_float_vector.singer b/target_postgres/tests/data_files/array_float_vector.singer new file mode 100644 index 00000000..d87ae3b8 --- /dev/null +++ b/target_postgres/tests/data_files/array_float_vector.singer @@ -0,0 +1,5 @@ +{"type": "SCHEMA", "stream": "array_float_vector", "key_properties": ["id"], "schema": {"required": ["id"], "type": "object", "properties": {"id": {"type": "integer"}, "value": {"type": "array", "items": {"type": "number"}, "storage": {"type": "vector", "dim": 4}}}}} +{"type": "RECORD", "stream": "array_float_vector", "record": {"id": 1, "value": [ 1.1, 2.1, 1.1, 1.3 ]}} +{"type": "RECORD", "stream": "array_float_vector", "record": {"id": 2, "value": [ 1.0, 1.0, 1.0, 2.3 ]}} +{"type": "RECORD", "stream": "array_float_vector", "record": {"id": 3, "value": [ 2.0, 1.2, 1.0, 0.9 ]}} +{"type": "STATE", "value": {"array_float_vector": 3}} diff --git a/target_postgres/tests/init.sql b/target_postgres/tests/init.sql new file mode 100644 index 00000000..0aa0fc22 --- /dev/null +++ b/target_postgres/tests/init.sql @@ -0,0 +1 @@ +CREATE EXTENSION IF NOT EXISTS vector; diff --git a/target_postgres/tests/test_target_postgres.py b/target_postgres/tests/test_target_postgres.py index ab4cd11d..051e4b5f 100644 --- a/target_postgres/tests/test_target_postgres.py +++ b/target_postgres/tests/test_target_postgres.py @@ -458,6 +458,24 @@ def test_array_boolean(postgres_target): ) +def test_array_float_vector(postgres_target): + file_name = "array_float_vector.singer" + singer_file_to_target(file_name, postgres_target) + row = { + "id": 1, + "value": [Decimal("1.1"), Decimal("2.1"), Decimal("1.1"), Decimal("1.3")], + } + verify_data(postgres_target, "array_float_vector", 3, "id", row) + verify_schema( + postgres_target, + "array_float_vector", + check_columns={ + "id": {"type": BIGINT}, + "value": {"type": ARRAY}, + }, + ) + + def test_array_number(postgres_target): file_name = "array_number.singer" singer_file_to_target(file_name, postgres_target) diff --git a/tox.ini b/tox.ini index 0c287e8f..85c03b5a 100644 --- a/tox.ini +++ b/tox.ini @@ -9,7 +9,7 @@ isolated_build = true allowlist_externals = poetry commands = - poetry install -v + poetry install --all-extras -v poetry run pytest poetry run black --check target_postgres/ poetry run flake8 target_postgres @@ -21,14 +21,14 @@ commands = # To execute, run `tox -e pytest` envlist = py37, py38, py39 commands = - poetry install -v + poetry install --all-extras -v poetry run pytest [testenv:format] # Attempt to auto-resolve lint errors before they are raised. # To execute, run `tox -e format` commands = - poetry install -v + poetry install --all-extras -v poetry run black target_postgres/ poetry run isort target_postgres @@ -36,7 +36,7 @@ commands = # Raise an error if lint and style standards are not met. # To execute, run `tox -e lint` commands = - poetry install -v + poetry install --all-extras -v poetry run black --check --diff target_postgres/ poetry run isort --check target_postgres poetry run flake8 target_postgres