From 0d53618fc00ae87979b19098cab5a47b29250521 Mon Sep 17 00:00:00 2001 From: Philippe Rolet Date: Fri, 10 Jan 2025 08:51:01 +0100 Subject: [PATCH] [Document / Table API] Enforce required title and mimetype on upsert (#9877) * enforce on front / types * enforce in core * table fix * clean * fix rust --- core/bin/core_api.rs | 8 +-- core/src/data_sources/data_source.rs | 8 +-- core/src/stores/postgres.rs | 4 +- core/src/stores/store.rs | 4 +- .../data_source/DocumentUploadOrEditModal.tsx | 2 +- .../data_source/TableUploadOrEditModal.tsx | 1 + front/lib/api/data_sources.ts | 8 +-- front/lib/api/tables.ts | 4 +- front/lib/upsert_queue.ts | 70 ++++++++----------- .../[dsId]/documents/[documentId]/index.ts | 11 +-- .../data_sources/[dsId]/tables/csv.ts | 5 +- sdks/js/src/types.ts | 8 +-- .../front/api_handlers/public/data_sources.ts | 70 ++++++++----------- types/src/front/lib/core_api.ts | 12 ++-- 14 files changed, 99 insertions(+), 116 deletions(-) diff --git a/core/bin/core_api.rs b/core/bin/core_api.rs index 78fd6bb8e837..0732ae9fc7c3 100644 --- a/core/bin/core_api.rs +++ b/core/bin/core_api.rs @@ -1686,8 +1686,8 @@ struct DataSourcesDocumentsUpsertPayload { section: Section, credentials: run::Credentials, light_document_output: Option, - title: Option, - mime_type: Option, + title: String, + mime_type: String, } async fn data_sources_documents_upsert( @@ -2191,8 +2191,8 @@ struct DatabasesTablesUpsertPayload { remote_database_secret_id: Option, // Node meta: - title: Option, - mime_type: Option, + title: String, + mime_type: String, } async fn tables_upsert( diff --git a/core/src/data_sources/data_source.rs b/core/src/data_sources/data_source.rs index e0581f5729c6..be0a79e8890a 100644 --- a/core/src/data_sources/data_source.rs +++ b/core/src/data_sources/data_source.rs @@ -633,8 +633,8 @@ impl DataSource { store: Box, qdrant_clients: QdrantClients, document_id: &str, - title: Option, - mime_type: Option, + title: String, + mime_type: String, timestamp: Option, tags: &Vec, parents: &Vec, @@ -716,8 +716,8 @@ impl DataSource { &self.internal_id, document_id, timestamp, - title.as_deref().unwrap_or(document_id), - mime_type.as_deref().unwrap_or("application/octet-stream"), + title.as_str(), + mime_type.as_str(), &tags, &parents.get(1).cloned(), parents, diff --git a/core/src/stores/postgres.rs b/core/src/stores/postgres.rs index 6e9a54322de5..ad85ed938dee 100644 --- a/core/src/stores/postgres.rs +++ b/core/src/stores/postgres.rs @@ -2700,7 +2700,7 @@ impl Store for PostgresStore { } }; - let title = upsert_params.title.unwrap_or(upsert_params.name.clone()); + let title = upsert_params.title; let table = Table::new( project, @@ -2712,7 +2712,7 @@ impl Store for PostgresStore { upsert_params.description, upsert_params.timestamp, title, - upsert_params.mime_type.unwrap_or("text/csv".to_string()), + upsert_params.mime_type, upsert_params.tags, upsert_params.parents.get(1).cloned(), upsert_params.parents, diff --git a/core/src/stores/store.rs b/core/src/stores/store.rs index 6c734a7d7ace..c047487044d1 100644 --- a/core/src/stores/store.rs +++ b/core/src/stores/store.rs @@ -67,8 +67,8 @@ pub struct TableUpsertParams { pub parents: Vec, pub remote_database_table_id: Option, pub remote_database_secret_id: Option, - pub title: Option, - pub mime_type: Option, + pub title: String, + pub mime_type: String, } pub struct FolderUpsertParams { diff --git a/front/components/data_source/DocumentUploadOrEditModal.tsx b/front/components/data_source/DocumentUploadOrEditModal.tsx index a33744c56c1a..e665a712aa52 100644 --- a/front/components/data_source/DocumentUploadOrEditModal.tsx +++ b/front/components/data_source/DocumentUploadOrEditModal.tsx @@ -147,7 +147,7 @@ export const DocumentUploadOrEditModal = ({ const body = { name: initialId ?? document.name, title: initialId ?? document.name, - mime_type: document.mimeType ?? undefined, + mime_type: document.mimeType ?? "application/octet-stream", timestamp: null, parent_id: null, parents: [initialId ?? document.name], diff --git a/front/components/data_source/TableUploadOrEditModal.tsx b/front/components/data_source/TableUploadOrEditModal.tsx index 2d28e7d73221..29ad4a9f30dd 100644 --- a/front/components/data_source/TableUploadOrEditModal.tsx +++ b/front/components/data_source/TableUploadOrEditModal.tsx @@ -111,6 +111,7 @@ export const TableUploadOrEditModal = ({ truncate: true, useAppForHeaderDetection, title: table.name, + mimeType: "text/csv", timestamp: undefined, tags: undefined, parentId: undefined, diff --git a/front/lib/api/data_sources.ts b/front/lib/api/data_sources.ts index bcb964d5e4c6..ff23edbe56dd 100644 --- a/front/lib/api/data_sources.ts +++ b/front/lib/api/data_sources.ts @@ -237,8 +237,8 @@ export type UpsertDocumentArgs = { light_document_output?: boolean; dataSource: DataSourceResource; auth: Authenticator; - mime_type?: string; - title?: string; + mime_type: string; + title: string; }; export async function upsertDocument({ name, @@ -439,8 +439,8 @@ export type UpsertTableArgs = { dataSource: DataSourceResource; auth: Authenticator; useAppForHeaderDetection?: boolean; - title?: string; - mimeType?: string; + title: string; + mimeType: string; }; export async function upsertTable({ tableId, diff --git a/front/lib/api/tables.ts b/front/lib/api/tables.ts index 9203c5ce2fbc..46584f31a44a 100644 --- a/front/lib/api/tables.ts +++ b/front/lib/api/tables.ts @@ -155,8 +155,8 @@ export async function upsertTableFromCsv({ truncate: boolean; useAppForHeaderDetection: boolean; detectedHeaders?: DetectedHeadersType; - title?: string; - mimeType?: string; + title: string; + mimeType: string; }): Promise> { const csvRowsRes = csv ? await rowsFromCsv({ diff --git a/front/lib/upsert_queue.ts b/front/lib/upsert_queue.ts index 8d3288f91c62..049ad900fa3b 100644 --- a/front/lib/upsert_queue.ts +++ b/front/lib/upsert_queue.ts @@ -16,51 +16,43 @@ import { launchUpsertTableWorkflow } from "@app/temporal/upsert_tables/client"; const { DUST_UPSERT_QUEUE_BUCKET, SERVICE_ACCOUNT } = process.env; -export const EnqueueUpsertDocument = t.intersection([ - t.type({ - workspaceId: t.string, - dataSourceId: t.string, - documentId: t.string, - tags: t.union([t.array(t.string), t.null]), - parentId: t.union([t.string, t.null, t.undefined]), - parents: t.union([t.array(t.string), t.null]), - sourceUrl: t.union([t.string, t.null]), - timestamp: t.union([t.number, t.null]), - section: FrontDataSourceDocumentSection, - upsertContext: t.union([UpsertContextSchema, t.null]), - }), - t.partial({ - title: t.string, - mimeType: t.string, - }), -]); +export const EnqueueUpsertDocument = t.type({ + workspaceId: t.string, + dataSourceId: t.string, + documentId: t.string, + tags: t.union([t.array(t.string), t.null]), + parentId: t.union([t.string, t.null, t.undefined]), + parents: t.union([t.array(t.string), t.null]), + sourceUrl: t.union([t.string, t.null]), + timestamp: t.union([t.number, t.null]), + section: FrontDataSourceDocumentSection, + upsertContext: t.union([UpsertContextSchema, t.null]), + title: t.string, + mimeType: t.string, +}); const DetectedHeaders = t.type({ header: t.array(t.string), rowIndex: t.number, }); -export const EnqueueUpsertTable = t.intersection([ - t.type({ - workspaceId: t.string, - dataSourceId: t.string, - tableId: t.string, - tableName: t.string, - tableDescription: t.string, - tableTimestamp: t.union([t.number, t.undefined, t.null]), - tableTags: t.union([t.array(t.string), t.undefined, t.null]), - tableParentId: t.union([t.string, t.undefined, t.null]), - tableParents: t.union([t.array(t.string), t.undefined, t.null]), - csv: t.union([t.string, t.null]), - truncate: t.boolean, - useAppForHeaderDetection: t.union([t.boolean, t.undefined, t.null]), - detectedHeaders: t.union([DetectedHeaders, t.undefined]), - }), - t.partial({ - title: t.string, - mimeType: t.string, - }), -]); +export const EnqueueUpsertTable = t.type({ + workspaceId: t.string, + dataSourceId: t.string, + tableId: t.string, + tableName: t.string, + tableDescription: t.string, + tableTimestamp: t.union([t.number, t.undefined, t.null]), + tableTags: t.union([t.array(t.string), t.undefined, t.null]), + tableParentId: t.union([t.string, t.undefined, t.null]), + tableParents: t.union([t.array(t.string), t.undefined, t.null]), + csv: t.union([t.string, t.null]), + truncate: t.boolean, + useAppForHeaderDetection: t.union([t.boolean, t.undefined, t.null]), + detectedHeaders: t.union([DetectedHeaders, t.undefined]), + title: t.string, + mimeType: t.string, +}); type EnqueueUpsertDocumentType = t.TypeOf; diff --git a/front/pages/api/v1/w/[wId]/spaces/[spaceId]/data_sources/[dsId]/documents/[documentId]/index.ts b/front/pages/api/v1/w/[wId]/spaces/[spaceId]/data_sources/[dsId]/documents/[documentId]/index.ts index 7908d97bc6ec..ca54b4d26b6d 100644 --- a/front/pages/api/v1/w/[wId]/spaces/[spaceId]/data_sources/[dsId]/documents/[documentId]/index.ts +++ b/front/pages/api/v1/w/[wId]/spaces/[spaceId]/data_sources/[dsId]/documents/[documentId]/index.ts @@ -511,6 +511,9 @@ async function handler( const documentId = req.query.documentId as string; + const title = r.data.title ?? "Untitled document"; + const mimeType = r.data.mime_type ?? "application/octet-stream"; + if (r.data.async === true) { const enqueueRes = await enqueueUpsertDocument({ upsertDocument: { @@ -524,8 +527,8 @@ async function handler( sourceUrl, section, upsertContext: r.data.upsert_context || null, - title: r.data.title ?? undefined, - mimeType: r.data.mime_type ?? undefined, + title, + mimeType, }, }); if (enqueueRes.isErr()) { @@ -565,8 +568,8 @@ async function handler( section, credentials, lightDocumentOutput: r.data.light_document_output === true, - title: r.data.title, - mimeType: r.data.mime_type, + title, + mimeType, }); if (upsertRes.isErr()) { diff --git a/front/pages/api/v1/w/[wId]/spaces/[spaceId]/data_sources/[dsId]/tables/csv.ts b/front/pages/api/v1/w/[wId]/spaces/[spaceId]/data_sources/[dsId]/tables/csv.ts index a65e87d1c2ef..fe299c08c0bf 100644 --- a/front/pages/api/v1/w/[wId]/spaces/[spaceId]/data_sources/[dsId]/tables/csv.ts +++ b/front/pages/api/v1/w/[wId]/spaces/[spaceId]/data_sources/[dsId]/tables/csv.ts @@ -115,10 +115,7 @@ async function handler( } const upsertRes = await handleDataSourceTableCSVUpsert({ auth, - params: { - ...r.data, - title: r.data.title ?? r.data.name, - }, + params: r.data, dataSource, }); diff --git a/sdks/js/src/types.ts b/sdks/js/src/types.ts index e17a342925b6..f18a9a84fbc9 100644 --- a/sdks/js/src/types.ts +++ b/sdks/js/src/types.ts @@ -1980,8 +1980,8 @@ export const UpsertTableFromCsvRequestSchema = z.intersection( truncate: z.boolean(), useAppForHeaderDetection: z.boolean().nullable().optional(), async: z.boolean().optional(), - title: z.string().optional(), - mimeType: z.string().optional(), + title: z.string(), + mimeType: z.string(), }) .transform((o) => ({ name: o.name, @@ -2048,8 +2048,8 @@ export const UpsertDatabaseTableRequestSchema = z.object({ parents: z.array(z.string()).nullable().optional(), remote_database_table_id: z.string().nullable().optional(), remote_database_secret_id: z.string().nullable().optional(), - title: z.string().optional(), - mime_type: z.string().optional(), + title: z.string(), + mime_type: z.string(), }); export type UpsertDatabaseTableRequestType = z.infer< diff --git a/types/src/front/api_handlers/public/data_sources.ts b/types/src/front/api_handlers/public/data_sources.ts index aae9fc0c2589..515265aa072c 100644 --- a/types/src/front/api_handlers/public/data_sources.ts +++ b/types/src/front/api_handlers/public/data_sources.ts @@ -27,24 +27,20 @@ export type FrontDataSourceDocumentSectionType = t.TypeOf< typeof FrontDataSourceDocumentSection >; -export const PostDataSourceDocumentRequestBodySchema = t.intersection([ - t.type({ - timestamp: t.union([t.Int, t.undefined, t.null]), - tags: t.union([t.array(t.string), t.undefined, t.null]), - parent_id: t.union([t.string, t.undefined, t.null]), - parents: t.union([t.array(t.string), t.undefined, t.null]), - source_url: t.union([t.string, t.undefined, t.null]), - upsert_context: t.union([UpsertContextSchema, t.undefined, t.null]), - text: t.union([t.string, t.undefined, t.null]), - section: t.union([FrontDataSourceDocumentSection, t.undefined, t.null]), - light_document_output: t.union([t.boolean, t.undefined]), - async: t.union([t.boolean, t.undefined, t.null]), - }), - t.partial({ - title: t.string, - mime_type: t.string, - }), -]); +export const PostDataSourceDocumentRequestBodySchema = t.type({ + timestamp: t.union([t.Int, t.undefined, t.null]), + tags: t.union([t.array(t.string), t.undefined, t.null]), + parent_id: t.union([t.string, t.undefined, t.null]), + parents: t.union([t.array(t.string), t.undefined, t.null]), + source_url: t.union([t.string, t.undefined, t.null]), + upsert_context: t.union([UpsertContextSchema, t.undefined, t.null]), + text: t.union([t.string, t.undefined, t.null]), + section: t.union([FrontDataSourceDocumentSection, t.undefined, t.null]), + light_document_output: t.union([t.boolean, t.undefined]), + async: t.union([t.boolean, t.undefined, t.null]), + title: t.string, + mime_type: t.string, +}); export type PostDataSourceDocumentRequestBody = t.TypeOf< typeof PostDataSourceDocumentRequestBodySchema @@ -66,24 +62,20 @@ export type PatchDataSourceWithNameDocumentRequestBody = t.TypeOf< typeof PostDataSourceWithNameDocumentRequestBodySchema >; -export const PatchDataSourceTableRequestBodySchema = t.intersection([ - t.type({ - name: t.string, - description: t.string, - timestamp: t.union([t.number, t.undefined, t.null]), - tags: t.union([t.array(t.string), t.undefined, t.null]), - parentId: t.union([t.string, t.undefined, t.null]), - parents: t.union([t.array(t.string), t.undefined, t.null]), - truncate: t.boolean, - async: t.union([t.boolean, t.undefined]), - csv: t.union([t.string, t.undefined]), - useAppForHeaderDetection: t.union([t.boolean, t.undefined]), - }), - t.partial({ - title: t.string, - mimeType: t.string, - }), -]); +export const PatchDataSourceTableRequestBodySchema = t.type({ + name: t.string, + description: t.string, + timestamp: t.union([t.number, t.undefined, t.null]), + tags: t.union([t.array(t.string), t.undefined, t.null]), + parentId: t.union([t.string, t.undefined, t.null]), + parents: t.union([t.array(t.string), t.undefined, t.null]), + truncate: t.boolean, + async: t.union([t.boolean, t.undefined]), + csv: t.union([t.string, t.undefined]), + useAppForHeaderDetection: t.union([t.boolean, t.undefined]), + title: t.string, + mimeType: t.string, +}); export type PatchDataSourceTableRequestBody = t.TypeOf< typeof PatchDataSourceTableRequestBodySchema @@ -112,6 +104,8 @@ export const UpsertTableFromCsvRequestSchema = t.intersection([ truncate: t.boolean, useAppForHeaderDetection: t.union([t.boolean, t.undefined, t.null]), async: t.union([t.boolean, t.undefined]), + title: t.string, + mimeType: t.string, }), // csv is optional when editing an existing table. t.union([ @@ -121,10 +115,6 @@ export const UpsertTableFromCsvRequestSchema = t.intersection([ tableId: t.string, }), ]), - t.partial({ - title: t.string, - mimeType: t.string, - }), ]); export type UpsertTableFromCsvRequestType = t.TypeOf< diff --git a/types/src/front/lib/core_api.ts b/types/src/front/lib/core_api.ts index e66806c09977..3569938d9cf3 100644 --- a/types/src/front/lib/core_api.ts +++ b/types/src/front/lib/core_api.ts @@ -855,8 +855,8 @@ export class CoreAPI { section: CoreAPIDataSourceDocumentSection; credentials: CredentialsType; lightDocumentOutput?: boolean; - title?: string | null; - mimeType?: string | null; + title: string; + mimeType: string; }): Promise< CoreAPIResponse<{ document: @@ -886,8 +886,8 @@ export class CoreAPI { source_url: sourceUrl, credentials, light_document_output: lightDocumentOutput, - title: title ?? null, - mime_type: mimeType ?? null, + title: title, + mime_type: mimeType, }), } ); @@ -1125,8 +1125,8 @@ export class CoreAPI { parents: string[]; remoteDatabaseTableId?: string | null; remoteDatabaseSecretId?: string | null; - title?: string; - mimeType?: string; + title: string; + mimeType: string; }): Promise> { const response = await this._fetchWithError( `${this._url}/projects/${encodeURIComponent(