Skip to content

Commit

Permalink
feat(KB): chunk catalog api (#39)
Browse files Browse the repository at this point in the history
Because

we are going to provide chunk catalog page

This commit

implemented the chunk related APIs
  • Loading branch information
Yougigun authored Jul 15, 2024
1 parent 51113ce commit 71a3996
Show file tree
Hide file tree
Showing 10 changed files with 1,867 additions and 267 deletions.
2 changes: 1 addition & 1 deletion .env
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# version
GOLANG_VERSION=1.21
GOLANG_VERSION=1.22
K6_VERSION=0.42.0

# service
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -123,3 +123,4 @@ test_.md
test_pdf_base64.txt
.DS_Store
cmd/main/__debug_bin*
.test
4 changes: 1 addition & 3 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
module github.com/instill-ai/artifact-backend

go 1.21.1

toolchain go1.21.3
go 1.22

require (
github.com/frankban/quicktest v1.14.6
Expand Down
168 changes: 168 additions & 0 deletions pkg/handler/chunks.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
package handler

import (
"context"
"fmt"

"github.com/google/uuid"
"github.com/instill-ai/artifact-backend/pkg/customerror"
"github.com/instill-ai/artifact-backend/pkg/logger"
"github.com/instill-ai/artifact-backend/pkg/repository"
artifactpb "github.com/instill-ai/protogen-go/artifact/artifact/v1alpha"
"go.uber.org/zap"
"google.golang.org/protobuf/types/known/timestamppb"
)

func (ph *PublicHandler) ListChunks(ctx context.Context, req *artifactpb.ListChunksRequest) (*artifactpb.ListChunksResponse, error) {
log, _ := logger.GetZapLogger(ctx)
uid, err := getUserUIDFromContext(ctx)
if err != nil {
log.Error("failed to get user id from header", zap.Error(err))
return nil, fmt.Errorf("failed to get user id from header: %v. err: %w", err, customerror.ErrUnauthenticated)
}
fileUID, err := uuid.Parse(req.FileUid)
if err != nil {
log.Error("failed to parse file uid", zap.Error(err))
return nil, fmt.Errorf("failed to parse file uid: %v. err: %w", err, customerror.ErrInvalidArgument)
}
// TODO: ACL - check if the user(uid from context) has access to the chunk. get knowledge base id and owner id and check if the user has access to the knowledge base
{
_ = uid
_ = req.OwnerId
_ = req.KbId
}

fileUIDs := []uuid.UUID{fileUID}
files, err := ph.service.Repository.GetKnowledgeBaseFilesByFileUIDs(ctx, fileUIDs)
if err != nil {
log.Error("failed to get knowledge base files by file uids", zap.Error(err))
return nil, fmt.Errorf("failed to get knowledge base files by file uids")
}
if len(files) == 0 {
log.Error("no files found for the given file uids")
return nil, fmt.Errorf("no files found for the given file uids: %v. err: %w", fileUIDs, customerror.ErrNotFound)
}
file := files[0]

sources, err := ph.service.Repository.GetSourceTableAndUIDByFileUIDs(ctx, []repository.KnowledgeBaseFile{file})
if err != nil {
log.Error("failed to get source table and uid by file uids", zap.Error(err))
return nil, fmt.Errorf("failed to get source table and uid by file uids ")
}

source, ok := sources[fileUID]
if !ok {
// source not found. if some files(e.g. pdf) don't have been converted yet. there is not source file for chunks.
return nil, nil
}

chunks, err := ph.service.Repository.GetTextChunksBySource(ctx, source.SourceTable, source.SourceUID)
if err != nil {
log.Error("failed to get text chunks by source", zap.Error(err))
return nil, fmt.Errorf("failed to get text chunks by source: %w ", err)

}

res := make([]*artifactpb.Chunk, 0, len(chunks))
for _, chunk := range chunks {
res = append(res, &artifactpb.Chunk{
ChunkUid: chunk.UID.String(),
Retrievable: chunk.Retrievable,
StartPos: uint32(chunk.StartPos),
EndPos: uint32(chunk.EndPos),
Tokens: uint32(chunk.Tokens),
CreateTime: timestamppb.New(*chunk.CreateTime),
OriginalFileUid: file.UID.String(),
})
}

return &artifactpb.ListChunksResponse{
Chunks: res,
}, nil
}

func (ph *PublicHandler) UpdateChunk(ctx context.Context, req *artifactpb.UpdateChunkRequest) (*artifactpb.UpdateChunkResponse, error) {
log, _ := logger.GetZapLogger(ctx)
uid, err := getUserUIDFromContext(ctx)
if err != nil {
log.Error("failed to get user id from header", zap.Error(err))
return nil, fmt.Errorf("failed to get user id from header: %v. err: %w", err, customerror.ErrUnauthenticated)
}

// TODO: ACL - check if the user(uid from context) has access to the chunk. get chunk's kb uid and check if the uid has access to the knowledge base
{
_ = uid
_ = req.ChunkUid
// use chunk uid to get the knowledge base id
// ....
// check ACL
}

retrievable := req.Retrievable
update := map[string]interface{}{
repository.TextChunkColumn.Retrievable: retrievable,
}

chunk, err := ph.service.Repository.UpdateChunk(ctx, req.ChunkUid, update)
if err != nil {
log.Error("failed to update text chunk", zap.Error(err))
return nil, fmt.Errorf("failed to update text chunk: %w", err)
}

return &artifactpb.UpdateChunkResponse{
// Populate the response fields appropriately
Chunk: &artifactpb.Chunk{
ChunkUid: chunk.UID.String(),
Retrievable: chunk.Retrievable,
StartPos: uint32(chunk.StartPos),
EndPos: uint32(chunk.EndPos),
Tokens: uint32(chunk.Tokens),
CreateTime: timestamppb.New(*chunk.CreateTime),
// OriginalFileUid: chunk.FileUID.String(),
},
}, nil
}

func (ph *PublicHandler) GetSourceFile(ctx context.Context, req *artifactpb.GetSourceFileRequest) (*artifactpb.GetSourceFileResponse, error) {
log, _ := logger.GetZapLogger(ctx)
uid, err := getUserUIDFromContext(ctx)
if err != nil {
log.Error("failed to get user id from header", zap.Error(err))
return nil, fmt.Errorf("failed to get user id from header: %v. err: %w", err, customerror.ErrUnauthenticated)
}
fileUID, err := uuid.Parse(req.FileUid)
if err != nil {
log.Error("failed to parse file uid", zap.Error(err))
return nil, fmt.Errorf("failed to parse file uid: %v. err: %w", err, customerror.ErrInvalidArgument)
}

// TODO ACL - check if the user(uid from context) has access to the source file.
{
_ = uid
_ = req.FileUid
// use file uid to get the knowledge base id
// ....
// check ACL
// ...
}
source, err := ph.service.Repository.GetTruthSourceByFileUID(ctx, fileUID)
if err != nil {
log.Error("failed to get truth source by file uid", zap.Error(err))
return nil, fmt.Errorf("failed to get truth source by file uid. err: %w", err)
}
// get the source file content from minIO using dest of source
content, err := ph.service.MinIO.GetFile(ctx, source.Dest)
if err != nil {
log.Error("failed to get file from minio", zap.Error(err))
return nil, fmt.Errorf("failed to get file from minio. err: %w", err)
}

return &artifactpb.GetSourceFileResponse{
// Populate the response fields appropriately
SourceFile: &artifactpb.SourceFile{
Content: string(content),
CreateTime: timestamppb.New(source.CreateTime),
UpdateTime: timestamppb.New(source.CreateTime),
},
}, nil
}
32 changes: 16 additions & 16 deletions pkg/handler/knowledgebase.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ func (ph *PublicHandler) CreateKnowledgeBase(ctx context.Context, req *artifactp
return nil, err
}

// TODO: check user's permission to create knowledge base in the user or org context
// TODO: ACL check user's permission to create knowledge base in the user or org context
// 1. if it is user namespace, it is okay
// 2. if it is org namespace, check if the user has permission to create knowledge base in the org
// ....
Expand Down Expand Up @@ -175,8 +175,8 @@ func (ph *PublicHandler) ListKnowledgeBases(ctx context.Context, req *artifactpb
DownstreamApps: []string{},
TotalFiles: uint32(fileCounts[kb.UID]),
TotalTokens: uint32(tokenCounts[kb.UID]),
// TODO: get used storage
UsedStorage: 0,
// TODO: get used storage of kb
UsedStorage: 0,
}
}
return &artifactpb.ListKnowledgeBasesResponse{
Expand Down Expand Up @@ -252,7 +252,7 @@ func (ph *PublicHandler) UpdateKnowledgeBase(ctx context.Context, req *artifactp
TotalFiles: uint32(fileCounts[kb.UID]),
TotalTokens: uint32(tokenCounts[kb.UID]),
// TODO: get used storage
UsedStorage: 0,
UsedStorage: 0,
},
}, nil
}
Expand Down Expand Up @@ -284,19 +284,19 @@ func (ph *PublicHandler) DeleteKnowledgeBase(ctx context.Context, req *artifactp

return &artifactpb.DeleteKnowledgeBaseResponse{
KnowledgeBase: &artifactpb.KnowledgeBase{
Name: deletedKb.Name,
KbId: deletedKb.KbID,
Description: deletedKb.Description,
Tags: deletedKb.Tags,
CreateTime: deletedKb.CreateTime.String(),
UpdateTime: deletedKb.UpdateTime.String(),
OwnerName: deletedKb.Owner,
Name: deletedKb.Name,
KbId: deletedKb.KbID,
Description: deletedKb.Description,
Tags: deletedKb.Tags,
CreateTime: deletedKb.CreateTime.String(),
UpdateTime: deletedKb.UpdateTime.String(),
OwnerName: deletedKb.Owner,
ConvertingPipelines: []string{},
EmbeddingPipelines: []string{},
DownstreamApps: []string{},
TotalFiles: 0,
TotalTokens: 0,
UsedStorage: 0,
EmbeddingPipelines: []string{},
DownstreamApps: []string{},
TotalFiles: 0,
TotalTokens: 0,
UsedStorage: 0,
},
}, nil
}
Expand Down
54 changes: 3 additions & 51 deletions pkg/handler/knowledgebasefiles.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package handler

import (
"context"
"errors"
"fmt"
"strings"

Expand All @@ -14,7 +13,6 @@ import (
mgmtpb "github.com/instill-ai/protogen-go/core/mgmt/v1beta"
"go.uber.org/zap"
"google.golang.org/protobuf/types/known/timestamppb"
"gorm.io/gorm"
)

func (ph *PublicHandler) UploadKnowledgeBaseFile(ctx context.Context, req *artifactpb.UploadKnowledgeBaseFileRequest) (*artifactpb.UploadKnowledgeBaseFileResponse, error) {
Expand Down Expand Up @@ -227,18 +225,18 @@ func (ph *PublicHandler) ListKnowledgeBaseFiles(ctx context.Context, req *artifa
return nil, err
}
// get the tokens and chunks using the source table and source uid
sources, err := ph.findSourceTableAndSourceUIDByFileUID(ctx, kbFiles)
sources, err := ph.service.Repository.GetSourceTableAndUIDByFileUIDs(ctx, kbFiles)
if err != nil {
log.Error("failed to find source table and source uid by file uid", zap.Error(err))
return nil, err
}
// TODO need test from file upload to file to embeded text then check the total tokens

totalTokens, err := ph.service.Repository.GetFilesTotalTokens(ctx, sources)
if err != nil {
log.Error("failed to get files total tokens", zap.Error(err))
return nil, err
}
// TODO get total chunks

totalChunks, err := ph.service.Repository.GetTotalChunksBySources(ctx, sources)
if err != nil {
log.Error("failed to get files total chunks", zap.Error(err))
Expand Down Expand Up @@ -272,52 +270,6 @@ func (ph *PublicHandler) ListKnowledgeBaseFiles(ctx context.Context, req *artifa
}, nil
}

// findSourceTableAndSourceUIDByFiles find the source table and source uid by file uid.
func (ph *PublicHandler) findSourceTableAndSourceUIDByFileUID(ctx context.Context, files []repository.KnowledgeBaseFile) (
map[uuid.UUID]struct {
SourceTable string
SourceUID uuid.UUID
}, error) {
result := make(map[uuid.UUID]struct {
SourceTable string
SourceUID uuid.UUID
})
logger, _ := logger.GetZapLogger(ctx)
for _, file := range files {
// find the source table and source uid by file uid
// check if the file is is text or markdown
switch file.Type {
case artifactpb.FileType_FILE_TYPE_TEXT.String(), artifactpb.FileType_FILE_TYPE_MARKDOWN.String():
result[file.UID] = struct {
SourceTable string
SourceUID uuid.UUID
}{
SourceTable: ph.service.Repository.KnowledgeBaseFileTableName(),
SourceUID: file.UID,
}
case artifactpb.FileType_FILE_TYPE_PDF.String():
convertedFile, err := ph.service.Repository.GetConvertedFileByFileUID(ctx, file.UID)
if err != nil {
if errors.Is(err, gorm.ErrRecordNotFound) {
continue
} else {
logger.Error("failed to get converted file by file uid", zap.Error(err))
return nil, err
}
}
result[file.UID] = struct {
SourceTable string
SourceUID uuid.UUID
}{
SourceTable: ph.service.Repository.ConvertedFileTableName(),
SourceUID: convertedFile.UID,
}
}
}

return result, nil
}

func (ph *PublicHandler) DeleteKnowledgeBaseFile(
ctx context.Context,
req *artifactpb.DeleteKnowledgeBaseFileRequest) (
Expand Down
Loading

0 comments on commit 71a3996

Please sign in to comment.