Skip to content

Commit

Permalink
feat(kb): add some kb metadata (#36)
Browse files Browse the repository at this point in the history
Because

The knowledge base metadata needs to include:
1. the number of files
2. the number of tokens
3. the correct pipeline names
This commit

1. Adds a kb_uid column to the text_chunk table and creates an index on it to speed up count queries.
2. Replaces the placeholder pipeline names with the preset pipeline names.
  • Loading branch information
Yougigun authored Jul 11, 2024
1 parent 703bb0b commit 0e42ff4
Show file tree
Hide file tree
Showing 14 changed files with 809 additions and 48 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -121,3 +121,4 @@ tmp
test_.pdf
test_.md
test_pdf_base64.txt
.DS_Store
16 changes: 13 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,22 @@ FROM --platform=$TARGETPLATFORM golang:${GOLANG_VERSION} AS build
WORKDIR /src

COPY go.mod go.sum ./
RUN go mod download

RUN --mount=type=cache,target=/go/pkg/mod \
go mod download
COPY . .

ARG SERVICE_NAME TARGETOS TARGETARCH
RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH CGO_ENABLED=0 go build -o /${SERVICE_NAME} ./cmd/main
RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH CGO_ENABLED=0 go build -o /${SERVICE_NAME}-migrate ./cmd/migration

RUN --mount=type=cache,target=/go/pkg/mod \
--mount=type=cache,target=/root/.cache/go-build \
GOOS=$TARGETOS GOARCH=$TARGETARCH CGO_ENABLED=0 go build -o /${SERVICE_NAME} ./cmd/main

# Build the migration tool
RUN --mount=type=cache,target=/go/pkg/mod \
--mount=type=cache,target=/root/.cache/go-build \
GOOS=$TARGETOS GOARCH=$TARGETARCH CGO_ENABLED=0 go build -o /${SERVICE_NAME}-migrate ./cmd/migration


FROM golang:${GOLANG_VERSION}

Expand Down
2 changes: 1 addition & 1 deletion config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ database:
host: pg-sql
port: 5432
name: artifact
version: 8
version: 9
timezone: Etc/UTC
pool:
idleconnections: 5
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ require (
github.com/grpc-ecosystem/go-grpc-middleware v1.3.0
github.com/grpc-ecosystem/grpc-gateway/v2 v2.19.1
github.com/influxdata/influxdb-client-go/v2 v2.12.3
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20240708222431-209c7b3b6ff5
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20240711114323-e16b79bfb545
github.com/instill-ai/usage-client v0.3.0-alpha.0.20240319060111-4a3a39f2fd61
github.com/instill-ai/x v0.3.0-alpha.0.20231219052200-6230a89e386c
github.com/knadh/koanf v1.5.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -342,8 +342,8 @@ github.com/influxdata/influxdb-client-go/v2 v2.12.3 h1:28nRlNMRIV4QbtIUvxhWqaxn0
github.com/influxdata/influxdb-client-go/v2 v2.12.3/go.mod h1:IrrLUbCjjfkmRuaCiGQg4m2GbkaeJDcuWoxiWdQEbA0=
github.com/influxdata/line-protocol v0.0.0-20200327222509-2487e7298839 h1:W9WBk7wlPfJLvMCdtV4zPulc4uCPrlywQOmbFOhgQNU=
github.com/influxdata/line-protocol v0.0.0-20200327222509-2487e7298839/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo=
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20240708222431-209c7b3b6ff5 h1:CZkUKdzp4TkVbrwov82B341R8Prw2wdmZVzDOl9jVEk=
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20240708222431-209c7b3b6ff5/go.mod h1:2blmpUwiTwxIDnrjIqT6FhR5ewshZZF554wzjXFvKpQ=
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20240711114323-e16b79bfb545 h1:W79+lcBA8DyqGo2gnDSh9pCwyru0oGWEjQxCQf6DqVs=
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20240711114323-e16b79bfb545/go.mod h1:2blmpUwiTwxIDnrjIqT6FhR5ewshZZF554wzjXFvKpQ=
github.com/instill-ai/usage-client v0.3.0-alpha.0.20240319060111-4a3a39f2fd61 h1:smPTvmXDhn/QC7y/TPXyMTqbbRd0gvzmFgWBChwTfhE=
github.com/instill-ai/usage-client v0.3.0-alpha.0.20240319060111-4a3a39f2fd61/go.mod h1:/TAHs4ybuylk5icuy+MQtHRc4XUnIyXzeNKxX9qDFhw=
github.com/instill-ai/x v0.3.0-alpha.0.20231219052200-6230a89e386c h1:a2RVkpIV2QcrGnSHAou+t/L+vBsaIfFvk5inVg5Uh4s=
Expand Down
9 changes: 9 additions & 0 deletions pkg/db/migration/000009_add_kb_uid_in_chunk_table.down.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
BEGIN;

-- Roll back migration 000009: remove the kb_uid index first, then the
-- kb_uid column itself. Both statements are guarded with IF EXISTS, so the
-- rollback does not fail when either object is already absent.
DROP INDEX IF EXISTS idx_text_chunk_kb_uid;

ALTER TABLE text_chunk
    DROP COLUMN IF EXISTS kb_uid;

COMMIT;
12 changes: 12 additions & 0 deletions pkg/db/migration/000009_add_kb_uid_in_chunk_table.up.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
BEGIN;

-- Add the kb_uid column to text_chunk. No NOT NULL constraint or default is
-- declared, so existing rows get NULL. IF NOT EXISTS keeps the statement from
-- failing when the column is already present (e.g. a re-run of the migration).
ALTER TABLE text_chunk ADD COLUMN IF NOT EXISTS kb_uid UUID;

-- Attach a description to the new column
COMMENT ON COLUMN text_chunk.kb_uid IS 'Knowledge Base unique identifier';

-- Index kb_uid so queries that filter or count chunks by kb_uid can use the
-- index. IF NOT EXISTS skips creation when a relation with this name already
-- exists.
CREATE INDEX IF NOT EXISTS idx_text_chunk_kb_uid ON text_chunk (kb_uid);

COMMIT;
Empty file.

This file was deleted.

100 changes: 69 additions & 31 deletions pkg/handler/knowledgebase.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,13 @@ func (ph *PublicHandler) CreateKnowledgeBase(ctx context.Context, req *artifactp
OwnerName: dbData.Owner,
CreateTime: dbData.CreateTime.String(),
UpdateTime: dbData.UpdateTime.String(),
ConvertingPipelines: []string{"leo/fake-pipeline-1", "leo/fake-pipeline-2"},
SplittingPipelines: []string{"leo/fake-pipeline-3", "leo/fake-pipeline-4"},
EmbeddingPipelines: []string{"leo/fake-pipeline-5", "leo/fake-pipeline-6"},
ConvertingPipelines: []string{"preset/indexing-convert-pdf"},
SplittingPipelines: []string{"preset/indexing-split-text", "preset/indexing-split-markdown"},
EmbeddingPipelines: []string{"preset/indexing-embed"},
DownstreamApps: []string{},
TotalFiles: 0,
TotalTokens: 0,
UsedStorage: 0,
},
}, nil
}
Expand Down Expand Up @@ -140,6 +144,21 @@ func (ph *PublicHandler) ListKnowledgeBases(ctx context.Context, req *artifactpb
return nil, fmt.Errorf(ErrorListKnowledgeBasesMsg, err)
}

kbUIDuuid := make([]uuid.UUID, len(dbData))
for i, kb := range dbData {
kbUIDuuid[i] = kb.UID
}

fileCounts, err := ph.service.Repository.GetCountFilesByListKnowledgeBaseUID(ctx, kbUIDuuid)
if err != nil {
log.Error("failed to get file counts", zap.Error(err))
return nil, fmt.Errorf(ErrorListKnowledgeBasesMsg, err)
}
tokenCounts, err := ph.service.Repository.GetTotalTokensByListKBUIDs(ctx, kbUIDuuid)
if err != nil {
log.Error("failed to get token counts", zap.Error(err))
return nil, fmt.Errorf(ErrorListKnowledgeBasesMsg, err)
}
kbs := make([]*artifactpb.KnowledgeBase, len(dbData))
for i, kb := range dbData {
kbs[i] = &artifactpb.KnowledgeBase{
Expand All @@ -150,10 +169,14 @@ func (ph *PublicHandler) ListKnowledgeBases(ctx context.Context, req *artifactpb
CreateTime: kb.CreateTime.String(),
UpdateTime: kb.UpdateTime.String(),
OwnerName: kb.Owner,
ConvertingPipelines: []string{"leo/fake-pipeline-1", "leo/fake-pipeline-2"},
SplittingPipelines: []string{"leo/fake-pipeline-3", "leo/fake-pipeline-4"},
EmbeddingPipelines: []string{"leo/fake-pipeline-5", "leo/fake-pipeline-6"},
// DownstreamApps: []string{"leo/fake-app-1", "leo/fake-app-2"},
ConvertingPipelines: []string{"preset/indexing-convert-pdf"},
SplittingPipelines: []string{"preset/indexing-split-text", "preset/indexing-split-markdown"},
EmbeddingPipelines: []string{"preset/indexing-embed"},
DownstreamApps: []string{},
TotalFiles: uint32(fileCounts[kb.UID]),
TotalTokens: uint32(tokenCounts[kb.UID]),
// TODO: get used storage
UsedStorage: 0,
}
}
return &artifactpb.ListKnowledgeBasesResponse{
Expand Down Expand Up @@ -187,7 +210,7 @@ func (ph *PublicHandler) UpdateKnowledgeBase(ctx context.Context, req *artifactp
// TODO: ACL - check user's permission to update knowledge base
_ = authUID
// check if knowledge base exists
dbData, err := ph.service.Repository.UpdateKnowledgeBase(
kb, err := ph.service.Repository.UpdateKnowledgeBase(
ctx,
ownerUUID,
repository.KnowledgeBase{
Expand All @@ -202,20 +225,34 @@ func (ph *PublicHandler) UpdateKnowledgeBase(ctx context.Context, req *artifactp
log.Error("failed to update knowledge base", zap.Error(err))
return nil, err
}
fileCounts, err := ph.service.Repository.GetCountFilesByListKnowledgeBaseUID(ctx, []uuid.UUID{kb.UID})
if err != nil {
log.Error("failed to get file counts", zap.Error(err))
return nil, fmt.Errorf(ErrorListKnowledgeBasesMsg, err)
}
tokenCounts, err := ph.service.Repository.GetTotalTokensByListKBUIDs(ctx, []uuid.UUID{kb.UID})
if err != nil {
log.Error("failed to get token counts", zap.Error(err))
return nil, fmt.Errorf(ErrorListKnowledgeBasesMsg, err)
}
// populate response
return &artifactpb.UpdateKnowledgeBaseResponse{
KnowledgeBase: &artifactpb.KnowledgeBase{
Name: dbData.Name,
KbId: dbData.KbID,
Description: dbData.Description,
Tags: dbData.Tags,
CreateTime: dbData.CreateTime.String(),
UpdateTime: dbData.UpdateTime.String(),
OwnerName: dbData.Owner,
ConvertingPipelines: []string{"leo/fake-pipeline-1", "leo/fake-pipeline-2"},
SplittingPipelines: []string{"leo/fake-pipeline-3", "leo/fake-pipeline-4"},
EmbeddingPipelines: []string{"leo/fake-pipeline-5", "leo/fake-pipeline-6"},
// DownstreamApps: []string{"leo/fake-app-1", "leo/fake-app-2"},
Name: kb.Name,
KbId: kb.KbID,
Description: kb.Description,
Tags: kb.Tags,
CreateTime: kb.CreateTime.String(),
UpdateTime: kb.UpdateTime.String(),
OwnerName: kb.Owner,
ConvertingPipelines: []string{"preset/indexing-convert-pdf"},
SplittingPipelines: []string{"preset/indexing-split-text", "preset/indexing-split-markdown"},
EmbeddingPipelines: []string{"preset/indexing-embed"},
DownstreamApps: []string{},
TotalFiles: uint32(fileCounts[kb.UID]),
TotalTokens: uint32(tokenCounts[kb.UID]),
// TODO: get used storage
UsedStorage: 0,
},
}, nil
}
Expand Down Expand Up @@ -244,21 +281,22 @@ func (ph *PublicHandler) DeleteKnowledgeBase(ctx context.Context, req *artifactp

return nil, err
}
// populate response

return &artifactpb.DeleteKnowledgeBaseResponse{
KnowledgeBase: &artifactpb.KnowledgeBase{
Name: deletedKb.Name,
KbId: deletedKb.KbID,
Description: deletedKb.Description,
Tags: deletedKb.Tags,
CreateTime: deletedKb.CreateTime.String(),
UpdateTime: deletedKb.UpdateTime.String(),
OwnerName: deletedKb.Owner,
ConvertingPipelines: []string{"leo/fake-pipeline-1", "leo/fake-pipeline-2"},
SplittingPipelines: []string{"leo/fake-pipeline-3", "leo/fake-pipeline-4"},
EmbeddingPipelines: []string{"leo/fake-pipeline-5", "leo/fake-pipeline-6"},
// DownstreamApps: []string{"leo/fake-app-1", "leo/fake-app-2"}
Name: deletedKb.Name,
KbId: deletedKb.KbID,
Description: deletedKb.Description,
Tags: deletedKb.Tags,
CreateTime: deletedKb.CreateTime.String(),
UpdateTime: deletedKb.UpdateTime.String(),
OwnerName: deletedKb.Owner,
ConvertingPipelines: []string{},
EmbeddingPipelines: []string{},
DownstreamApps: []string{},
TotalFiles: 0,
TotalTokens: 0,
UsedStorage: 0,
},
}, nil
}
Expand Down
Loading

0 comments on commit 0e42ff4

Please sign in to comment.