From 279c241668b9040ceb30104fc2b692062ccce9ed Mon Sep 17 00:00:00 2001 From: Gary Date: Wed, 13 Nov 2024 21:24:25 +0800 Subject: [PATCH] fix(artifact): update html file process (#129) Because there is a bug in the original file process for html, this commit updates the html file process --- pkg/service/pipeline.go | 16 ++++++++++++---- pkg/worker/worker.go | 18 ++++-------------- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/pkg/service/pipeline.go b/pkg/service/pipeline.go index df09f58..9887a46 100644 --- a/pkg/service/pipeline.go +++ b/pkg/service/pipeline.go @@ -41,7 +41,7 @@ const QAPipelineID = "retrieving-qna" const QAVersion = "v1.2.0" // ConvertToMDPipe using converting pipeline to convert some file type to MD and consume caller's credits -func (s *Service) ConvertToMDPipe(ctx context.Context, caller uuid.UUID, requester uuid.UUID, fileBase64 string, fileType artifactPb.FileType) (string, error) { +func (s *Service) ConvertToMDPipe(ctx context.Context, fileUID uuid.UUID, caller uuid.UUID, requester uuid.UUID, fileBase64 string, fileType artifactPb.FileType) (string, error) { logger, _ := logger.GetZapLogger(ctx) var md metadata.MD if requester != uuid.Nil { @@ -71,15 +71,15 @@ func (s *Service) ConvertToMDPipe(ctx context.Context, caller uuid.UUID, request artifactPb.FileType_FILE_TYPE_DOCX, artifactPb.FileType_FILE_TYPE_DOC, artifactPb.FileType_FILE_TYPE_PPT, - artifactPb.FileType_FILE_TYPE_PPTX, - artifactPb.FileType_FILE_TYPE_HTML: + artifactPb.FileType_FILE_TYPE_PPTX: pipelineID = ConvertDocToMDPipelineID2 version = DocToMDVersion2 // Spreadsheet types and others use the original pipeline case artifactPb.FileType_FILE_TYPE_XLSX, artifactPb.FileType_FILE_TYPE_XLS, - artifactPb.FileType_FILE_TYPE_CSV: + artifactPb.FileType_FILE_TYPE_CSV, + artifactPb.FileType_FILE_TYPE_HTML: pipelineID = ConvertDocToMDPipelineID version = DocToMDVersion @@ -87,6 +87,14 @@ func (s *Service) ConvertToMDPipe(ctx context.Context, caller uuid.UUID, request return "", 
fmt.Errorf("unsupported file type: %v", fileType) } + // save the converting pipeline metadata into database + convertingPipelineMetadata := NamespaceID + "/" + pipelineID + "@" + version + err := s.Repository.UpdateKbFileExtraMetaData(ctx, fileUID, "", convertingPipelineMetadata, "", "", nil, nil, nil, nil) + if err != nil { + logger.Error("Failed to save converting pipeline metadata.", zap.String("File uid:", fileUID.String())) + return "", fmt.Errorf("failed to save converting pipeline metadata: %w", err) + } + req := &pipelinePb.TriggerNamespacePipelineReleaseRequest{ NamespaceId: NamespaceID, PipelineId: pipelineID, diff --git a/pkg/worker/worker.go b/pkg/worker/worker.go index 094c0e7..5db5c42 100644 --- a/pkg/worker/worker.go +++ b/pkg/worker/worker.go @@ -453,19 +453,9 @@ func (wp *fileToEmbWorkerPool) processConvertingFile(ctx context.Context, file r // encode data to base64 base64Data := base64.StdEncoding.EncodeToString(data) - // save the converting pipeline metadata into database - convertingPipelineMetadata := service.NamespaceID + "/" + service.ConvertDocToMDPipelineID + "@" + service.DocToMDVersion - err = wp.svc.Repository.UpdateKbFileExtraMetaData(ctx, file.UID, "", convertingPipelineMetadata, "", "", nil, nil, nil, nil) - if err != nil { - logger.Error("Failed to save converting pipeline metadata.", zap.String("File uid:", file.UID.String())) - return nil, - artifactpb.FileProcessStatus_FILE_PROCESS_STATUS_UNSPECIFIED, - fmt.Errorf("failed to save converting pipeline metadata: %w", err) - } - // convert the pdf file to md requesterUID := file.RequesterUID - convertedMD, err := wp.svc.ConvertToMDPipe(ctx, file.CreatorUID, requesterUID, base64Data, artifactpb.FileType(artifactpb.FileType_value[file.Type])) + convertedMD, err := wp.svc.ConvertToMDPipe(ctx, file.UID, file.CreatorUID, requesterUID, base64Data, artifactpb.FileType(artifactpb.FileType_value[file.Type])) if err != nil { logger.Error("Failed to convert pdf to md using pdf-to-md 
pipeline.", zap.String("File path", fileInMinIOPath)) return nil, artifactpb.FileProcessStatus_FILE_PROCESS_STATUS_UNSPECIFIED, err @@ -558,15 +548,15 @@ func (wp *fileToEmbWorkerPool) processChunkingFile(ctx context.Context, file rep switch file.Type { case artifactpb.FileType_FILE_TYPE_XLSX.String(), artifactpb.FileType_FILE_TYPE_XLS.String(), - artifactpb.FileType_FILE_TYPE_CSV.String(): + artifactpb.FileType_FILE_TYPE_CSV.String(), + artifactpb.FileType_FILE_TYPE_HTML.String(): requesterUID := file.RequesterUID chunks, err = wp.svc.SplitMarkdownPipe(ctx, file.CreatorUID, requesterUID, string(convertedFileData)) case artifactpb.FileType_FILE_TYPE_PDF.String(), artifactpb.FileType_FILE_TYPE_DOCX.String(), artifactpb.FileType_FILE_TYPE_DOC.String(), artifactpb.FileType_FILE_TYPE_PPTX.String(), - artifactpb.FileType_FILE_TYPE_PPT.String(), - artifactpb.FileType_FILE_TYPE_HTML.String(): + artifactpb.FileType_FILE_TYPE_PPT.String(): requesterUID := file.RequesterUID chunks, err = wp.svc.SplitTextPipe(ctx, file.CreatorUID, requesterUID, string(convertedFileData)) }