Skip to content

Commit

Permalink
feat: Add support for pgvector's vector data type
Browse files Browse the repository at this point in the history
  • Loading branch information
amotl committed Dec 13, 2023
1 parent af77af4 commit 312395d
Show file tree
Hide file tree
Showing 7 changed files with 111 additions and 9 deletions.
12 changes: 9 additions & 3 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
version: "2.1"
services:
postgres:
image: docker.io/postgres:latest
image: ankane/pgvector:latest
command: postgres -c ssl=on -c ssl_cert_file=/var/lib/postgresql/server.crt -c ssl_key_file=/var/lib/postgresql/server.key -c ssl_ca_file=/var/lib/postgresql/ca.crt -c hba_file=/var/lib/postgresql/pg_hba.conf
environment:
POSTGRES_USER: postgres
Expand All @@ -13,16 +13,19 @@ services:
POSTGRES_INITDB_ARGS: --auth-host=cert
# Not placed in the data directory (/var/lib/postgresql/data) because of https://gist.github.com/mrw34/c97bb03ea1054afb551886ffc8b63c3b?permalink_comment_id=2678568#gistcomment-2678568
volumes:
- ./target_postgres/tests/init.sql:/docker-entrypoint-initdb.d/init.sql
- ./ssl/server.crt:/var/lib/postgresql/server.crt # Certificate verifying the server's identity to the client.
- ./ssl/server.key:/var/lib/postgresql/server.key # Private key to verify the server's certificate is legitimate.
- ./ssl/ca.crt:/var/lib/postgresql/ca.crt # Certificate authority to use when verifying the client's identity to the server.
- ./ssl/pg_hba.conf:/var/lib/postgresql/pg_hba.conf # Configuration file to allow connection over SSL.
ports:
- "5432:5432"
postgres_no_ssl: # Borrowed from https://github.com/MeltanoLabs/tap-postgres/blob/main/.github/workflows/test.yml#L13-L23
image: docker.io/postgres:latest
image: ankane/pgvector:latest
environment:
POSTGRES_PASSWORD: postgres
volumes:
- ./target_postgres/tests/init.sql:/docker-entrypoint-initdb.d/init.sql
# Host port 5433 -> container port 5432. Quoted, like the other port mappings
# in this file, so the HOST:CONTAINER pair is always loaded as a string.
ports:
- "5433:5432"
ssh:
Expand All @@ -37,17 +40,20 @@ services:
- PASSWORD_ACCESS=false
- USER_NAME=melty
volumes:
- ./target_postgres/tests/init.sql:/docker-entrypoint-initdb.d/init.sql
- ./ssh_tunnel/ssh-server-config:/config/ssh_host_keys:ro
ports:
- "127.0.0.1:2223:2222"
networks:
- inner
postgresdb:
image: postgres:13.0
image: ankane/pgvector:latest
environment:
POSTGRES_USER: postgres
POSTGRES_PASSWORD: postgres
POSTGRES_DB: main
volumes:
- ./target_postgres/tests/init.sql:/docker-entrypoint-initdb.d/init.sql
networks:
inner:
ipv4_address: 10.5.0.5
Expand Down
60 changes: 55 additions & 5 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ packages = [
python = "<3.12,>=3.8.1"
requests = "^2.25.1"
singer-sdk = ">=0.28,<0.34"
pgvector = { version="^0.2.4", optional = true }
psycopg2-binary = "2.9.9"
sqlalchemy = ">=2.0,<3.0"
sshtunnel = "0.4.0"
Expand All @@ -50,6 +51,9 @@ types-simplejson = "^3.19.0.2"
types-sqlalchemy = "^1.4.53.38"
types-jsonschema = "^4.19.0.3"

[tool.poetry.extras]
pgvector = ["pgvector"]

[tool.mypy]
exclude = "tests"

Expand Down
19 changes: 18 additions & 1 deletion target_postgres/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def __init__(self, config: dict) -> None:
self.ssh_tunnel = SSHTunnelForwarder(
ssh_address_or_host=(ssh_config["host"], ssh_config["port"]),
ssh_username=ssh_config["username"],
ssh_private_key=self.guess_key_type(ssh_config["private_key"]),
ssh_pkey=self.guess_key_type(ssh_config["private_key"]),
ssh_private_key_password=ssh_config.get("private_key_password"),
remote_bind_address=(url.host, url.port),
)
Expand Down Expand Up @@ -277,6 +277,16 @@ def pick_individual_type(jsonschema_type: dict):
if "object" in jsonschema_type["type"]:
return JSONB()
if "array" in jsonschema_type["type"]:
# FIXME: This currently uses a non-conformant definition for the Singer SCHEMA.
# {"type": "array", "items": {"type": "number"}, "storage": {"type": "vector", "dim": 4}}
if (
"storage" in jsonschema_type
and "type" in jsonschema_type["storage"]
and jsonschema_type["storage"]["type"] == "vector"
):
from pgvector.sqlalchemy import Vector

return Vector(jsonschema_type["storage"]["dim"])
return ARRAY(JSONB())
if jsonschema_type.get("format") == "date-time":
return TIMESTAMP()
Expand Down Expand Up @@ -310,6 +320,13 @@ def pick_best_sql_type(sql_type_array: list):
NOTYPE,
]

try:
from pgvector.sqlalchemy import Vector

precedence_order.append(Vector)
except ImportError:
pass

for sql_type in precedence_order:
for obj in sql_type_array:
if isinstance(obj, sql_type):
Expand Down
5 changes: 5 additions & 0 deletions target_postgres/tests/data_files/array_float_vector.singer
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{"type": "SCHEMA", "stream": "array_float_vector", "key_properties": ["id"], "schema": {"required": ["id"], "type": "object", "properties": {"id": {"type": "integer"}, "value": {"type": "array", "items": {"type": "number"}, "storage": {"type": "vector", "dim": 4}}}}}
{"type": "RECORD", "stream": "array_float_vector", "record": {"id": 1, "value": [ 1.1, 2.1, 1.1, 1.3 ]}}
{"type": "RECORD", "stream": "array_float_vector", "record": {"id": 2, "value": [ 1.0, 1.0, 1.0, 2.3 ]}}
{"type": "RECORD", "stream": "array_float_vector", "record": {"id": 3, "value": [ 2.0, 1.2, 1.0, 0.9 ]}}
{"type": "STATE", "value": {"array_float_vector": 3}}
1 change: 1 addition & 0 deletions target_postgres/tests/init.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
CREATE EXTENSION IF NOT EXISTS vector;
19 changes: 19 additions & 0 deletions target_postgres/tests/test_target_postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,25 @@ def test_array_boolean(postgres_target):
)


def test_array_float_vector(postgres_target):
    """Load the ``array_float_vector`` fixture and check its data and schema.

    The test is skipped when ``pgvector.sqlalchemy`` cannot be imported.
    """
    vector_module = pytest.importorskip("pgvector.sqlalchemy")
    singer_file_to_target("array_float_vector.singer", postgres_target)

    # The expected vector is written as a string literal here; presumably that
    # is how verify_data sees the column value -- confirm against the helper.
    expected_row = {"id": 1, "value": "[1.1,2.1,1.1,1.3]"}
    # 3: the fixture holds three RECORD messages (presumably the row count).
    verify_data(postgres_target, "array_float_vector", 3, "id", expected_row)

    expected_columns = {
        "id": {"type": BIGINT},
        "value": {"type": vector_module.Vector},
    }
    verify_schema(
        postgres_target,
        "array_float_vector",
        check_columns=expected_columns,
    )


def test_array_number(postgres_target):
file_name = "array_number.singer"
singer_file_to_target(file_name, postgres_target)
Expand Down

0 comments on commit 312395d

Please sign in to comment.