Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(artifact): update html file process #129

Merged
merged 1 commit into from
Nov 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions pkg/service/pipeline.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ const QAPipelineID = "retrieving-qna"
const QAVersion = "v1.2.0"

// ConvertToMDPipe uses the converting pipeline to convert supported file types to Markdown and consumes the caller's credits.
func (s *Service) ConvertToMDPipe(ctx context.Context, caller uuid.UUID, requester uuid.UUID, fileBase64 string, fileType artifactPb.FileType) (string, error) {
func (s *Service) ConvertToMDPipe(ctx context.Context, fileUID uuid.UUID, caller uuid.UUID, requester uuid.UUID, fileBase64 string, fileType artifactPb.FileType) (string, error) {
logger, _ := logger.GetZapLogger(ctx)
var md metadata.MD
if requester != uuid.Nil {
Expand Down Expand Up @@ -71,22 +71,30 @@ func (s *Service) ConvertToMDPipe(ctx context.Context, caller uuid.UUID, request
artifactPb.FileType_FILE_TYPE_DOCX,
artifactPb.FileType_FILE_TYPE_DOC,
artifactPb.FileType_FILE_TYPE_PPT,
artifactPb.FileType_FILE_TYPE_PPTX,
artifactPb.FileType_FILE_TYPE_HTML:
artifactPb.FileType_FILE_TYPE_PPTX:
pipelineID = ConvertDocToMDPipelineID2
version = DocToMDVersion2

// Spreadsheet types and others use the original pipeline
case artifactPb.FileType_FILE_TYPE_XLSX,
artifactPb.FileType_FILE_TYPE_XLS,
artifactPb.FileType_FILE_TYPE_CSV:
artifactPb.FileType_FILE_TYPE_CSV,
artifactPb.FileType_FILE_TYPE_HTML:
pipelineID = ConvertDocToMDPipelineID
version = DocToMDVersion

default:
return "", fmt.Errorf("unsupported file type: %v", fileType)
}

// save the converting pipeline metadata into database
convertingPipelineMetadata := NamespaceID + "/" + pipelineID + "@" + version
err := s.Repository.UpdateKbFileExtraMetaData(ctx, fileUID, "", convertingPipelineMetadata, "", "", nil, nil, nil, nil)
if err != nil {
logger.Error("Failed to save converting pipeline metadata.", zap.String("File uid:", fileUID.String()))
return "", fmt.Errorf("failed to save converting pipeline metadata: %w", err)
}

req := &pipelinePb.TriggerNamespacePipelineReleaseRequest{
NamespaceId: NamespaceID,
PipelineId: pipelineID,
Expand Down
18 changes: 4 additions & 14 deletions pkg/worker/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -453,19 +453,9 @@ func (wp *fileToEmbWorkerPool) processConvertingFile(ctx context.Context, file r
// encode data to base64
base64Data := base64.StdEncoding.EncodeToString(data)

// save the converting pipeline metadata into database
convertingPipelineMetadata := service.NamespaceID + "/" + service.ConvertDocToMDPipelineID + "@" + service.DocToMDVersion
err = wp.svc.Repository.UpdateKbFileExtraMetaData(ctx, file.UID, "", convertingPipelineMetadata, "", "", nil, nil, nil, nil)
if err != nil {
logger.Error("Failed to save converting pipeline metadata.", zap.String("File uid:", file.UID.String()))
return nil,
artifactpb.FileProcessStatus_FILE_PROCESS_STATUS_UNSPECIFIED,
fmt.Errorf("failed to save converting pipeline metadata: %w", err)
}

// convert the file to md (the file type passed to the pipeline is taken from file.Type)
requesterUID := file.RequesterUID
convertedMD, err := wp.svc.ConvertToMDPipe(ctx, file.CreatorUID, requesterUID, base64Data, artifactpb.FileType(artifactpb.FileType_value[file.Type]))
convertedMD, err := wp.svc.ConvertToMDPipe(ctx, file.UID, file.CreatorUID, requesterUID, base64Data, artifactpb.FileType(artifactpb.FileType_value[file.Type]))
if err != nil {
logger.Error("Failed to convert pdf to md using pdf-to-md pipeline.", zap.String("File path", fileInMinIOPath))
return nil, artifactpb.FileProcessStatus_FILE_PROCESS_STATUS_UNSPECIFIED, err
Expand Down Expand Up @@ -558,15 +548,15 @@ func (wp *fileToEmbWorkerPool) processChunkingFile(ctx context.Context, file rep
switch file.Type {
case artifactpb.FileType_FILE_TYPE_XLSX.String(),
artifactpb.FileType_FILE_TYPE_XLS.String(),
artifactpb.FileType_FILE_TYPE_CSV.String():
artifactpb.FileType_FILE_TYPE_CSV.String(),
artifactpb.FileType_FILE_TYPE_HTML.String():
requesterUID := file.RequesterUID
chunks, err = wp.svc.SplitMarkdownPipe(ctx, file.CreatorUID, requesterUID, string(convertedFileData))
case artifactpb.FileType_FILE_TYPE_PDF.String(),
artifactpb.FileType_FILE_TYPE_DOCX.String(),
artifactpb.FileType_FILE_TYPE_DOC.String(),
artifactpb.FileType_FILE_TYPE_PPTX.String(),
artifactpb.FileType_FILE_TYPE_PPT.String(),
artifactpb.FileType_FILE_TYPE_HTML.String():
artifactpb.FileType_FILE_TYPE_PPT.String():
requesterUID := file.RequesterUID
chunks, err = wp.svc.SplitTextPipe(ctx, file.CreatorUID, requesterUID, string(convertedFileData))
}
Expand Down
Loading