Skip to content

Commit

Permalink
fix(artifact): update html file process (#129)
Browse files Browse the repository at this point in the history
Because

there is a bug in the original file processing for HTML

This commit

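update the HTML file processing so that HTML files are converted with the original conversion pipeline and chunked with the Markdown splitter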
  • Loading branch information
Yougigun authored Nov 13, 2024
1 parent b5be01b commit 279c241
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 18 deletions.
16 changes: 12 additions & 4 deletions pkg/service/pipeline.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ const QAPipelineID = "retrieving-qna"
const QAVersion = "v1.2.0"

// ConvertToMDPipe uses a converting pipeline to convert supported file types to Markdown and consumes the caller's credits.
func (s *Service) ConvertToMDPipe(ctx context.Context, caller uuid.UUID, requester uuid.UUID, fileBase64 string, fileType artifactPb.FileType) (string, error) {
func (s *Service) ConvertToMDPipe(ctx context.Context, fileUID uuid.UUID, caller uuid.UUID, requester uuid.UUID, fileBase64 string, fileType artifactPb.FileType) (string, error) {
logger, _ := logger.GetZapLogger(ctx)
var md metadata.MD
if requester != uuid.Nil {
Expand Down Expand Up @@ -71,22 +71,30 @@ func (s *Service) ConvertToMDPipe(ctx context.Context, caller uuid.UUID, request
artifactPb.FileType_FILE_TYPE_DOCX,
artifactPb.FileType_FILE_TYPE_DOC,
artifactPb.FileType_FILE_TYPE_PPT,
artifactPb.FileType_FILE_TYPE_PPTX,
artifactPb.FileType_FILE_TYPE_HTML:
artifactPb.FileType_FILE_TYPE_PPTX:
pipelineID = ConvertDocToMDPipelineID2
version = DocToMDVersion2

// Spreadsheet types and others use the original pipeline
case artifactPb.FileType_FILE_TYPE_XLSX,
artifactPb.FileType_FILE_TYPE_XLS,
artifactPb.FileType_FILE_TYPE_CSV:
artifactPb.FileType_FILE_TYPE_CSV,
artifactPb.FileType_FILE_TYPE_HTML:
pipelineID = ConvertDocToMDPipelineID
version = DocToMDVersion

default:
return "", fmt.Errorf("unsupported file type: %v", fileType)
}

// save the converting pipeline metadata into database
convertingPipelineMetadata := NamespaceID + "/" + pipelineID + "@" + version
err := s.Repository.UpdateKbFileExtraMetaData(ctx, fileUID, "", convertingPipelineMetadata, "", "", nil, nil, nil, nil)
if err != nil {
logger.Error("Failed to save converting pipeline metadata.", zap.String("File uid:", fileUID.String()))
return "", fmt.Errorf("failed to save converting pipeline metadata: %w", err)
}

req := &pipelinePb.TriggerNamespacePipelineReleaseRequest{
NamespaceId: NamespaceID,
PipelineId: pipelineID,
Expand Down
18 changes: 4 additions & 14 deletions pkg/worker/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -453,19 +453,9 @@ func (wp *fileToEmbWorkerPool) processConvertingFile(ctx context.Context, file r
// encode data to base64
base64Data := base64.StdEncoding.EncodeToString(data)

// save the converting pipeline metadata into database
convertingPipelineMetadata := service.NamespaceID + "/" + service.ConvertDocToMDPipelineID + "@" + service.DocToMDVersion
err = wp.svc.Repository.UpdateKbFileExtraMetaData(ctx, file.UID, "", convertingPipelineMetadata, "", "", nil, nil, nil, nil)
if err != nil {
logger.Error("Failed to save converting pipeline metadata.", zap.String("File uid:", file.UID.String()))
return nil,
artifactpb.FileProcessStatus_FILE_PROCESS_STATUS_UNSPECIFIED,
fmt.Errorf("failed to save converting pipeline metadata: %w", err)
}

// convert the file to md (the file type is taken from file.Type, not only pdf)
requesterUID := file.RequesterUID
convertedMD, err := wp.svc.ConvertToMDPipe(ctx, file.CreatorUID, requesterUID, base64Data, artifactpb.FileType(artifactpb.FileType_value[file.Type]))
convertedMD, err := wp.svc.ConvertToMDPipe(ctx, file.UID, file.CreatorUID, requesterUID, base64Data, artifactpb.FileType(artifactpb.FileType_value[file.Type]))
if err != nil {
logger.Error("Failed to convert pdf to md using pdf-to-md pipeline.", zap.String("File path", fileInMinIOPath))
return nil, artifactpb.FileProcessStatus_FILE_PROCESS_STATUS_UNSPECIFIED, err
Expand Down Expand Up @@ -558,15 +548,15 @@ func (wp *fileToEmbWorkerPool) processChunkingFile(ctx context.Context, file rep
switch file.Type {
case artifactpb.FileType_FILE_TYPE_XLSX.String(),
artifactpb.FileType_FILE_TYPE_XLS.String(),
artifactpb.FileType_FILE_TYPE_CSV.String():
artifactpb.FileType_FILE_TYPE_CSV.String(),
artifactpb.FileType_FILE_TYPE_HTML.String():
requesterUID := file.RequesterUID
chunks, err = wp.svc.SplitMarkdownPipe(ctx, file.CreatorUID, requesterUID, string(convertedFileData))
case artifactpb.FileType_FILE_TYPE_PDF.String(),
artifactpb.FileType_FILE_TYPE_DOCX.String(),
artifactpb.FileType_FILE_TYPE_DOC.String(),
artifactpb.FileType_FILE_TYPE_PPTX.String(),
artifactpb.FileType_FILE_TYPE_PPT.String(),
artifactpb.FileType_FILE_TYPE_HTML.String():
artifactpb.FileType_FILE_TYPE_PPT.String():
requesterUID := file.RequesterUID
chunks, err = wp.svc.SplitTextPipe(ctx, file.CreatorUID, requesterUID, string(convertedFileData))
}
Expand Down

0 comments on commit 279c241

Please sign in to comment.