diff --git a/connectors/src/connectors/webcrawler/temporal/activities.ts b/connectors/src/connectors/webcrawler/temporal/activities.ts index 29a96a854ba0..5b7925116bff 100644 --- a/connectors/src/connectors/webcrawler/temporal/activities.ts +++ b/connectors/src/connectors/webcrawler/temporal/activities.ts @@ -2,6 +2,7 @@ import type { CoreAPIDataSourceDocumentSection } from "@dust-tt/types"; import type { ModelId } from "@dust-tt/types"; import { WEBCRAWLER_MAX_DEPTH, WEBCRAWLER_MAX_PAGES } from "@dust-tt/types"; import { stripNullBytes } from "@dust-tt/types"; +import { validateUrl } from "@dust-tt/types/src/shared/utils/url_utils"; import { Context } from "@temporalio/activity"; import { isCancellation } from "@temporalio/workflow"; import { CheerioCrawler, Configuration, LogLevel } from "crawlee"; @@ -317,14 +318,26 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) { extracted.length > 0 && extracted.length <= MAX_SMALL_DOCUMENT_TXT_LEN ) { + const formattedDocumentContent = formatDocumentContent({ + title: pageTitle, + content: extracted, + url: request.url, + }); + if (!formattedDocumentContent) { + childLogger.info( + { + documentId, + configId: webCrawlerConfig.id, + url, + }, + `Invalid document or URL. Skipping` + ); + return; + } await upsertToDatasource({ dataSourceConfig, documentId: documentId, - documentContent: formatDocumentContent({ - title: pageTitle, - content: extracted, - url: request.url, - }), + documentContent: formattedDocumentContent, documentUrl: request.url, timestampMs: new Date().getTime(), tags: [`title:${stripNullBytes(pageTitle)}`], @@ -466,18 +479,25 @@ function formatDocumentContent({ title: string; content: string; url: string; -}): CoreAPIDataSourceDocumentSection { +}): CoreAPIDataSourceDocumentSection | null { const URL_MAX_LENGTH = 128; const TITLE_MAX_LENGTH = 300; - const parsedUrl = new URL(url); + + const validatedUrl = validateUrl(url); + if (!validatedUrl.valid || !validatedUrl.standardized) { + return null; + } + + const parsedUrl = new URL(validatedUrl.standardized); const urlWithoutQuery = `${parsedUrl.origin}/${parsedUrl.pathname}`; const sanitizedContent = stripNullBytes(content); const sanitizedTitle = stripNullBytes(title); + const sanitizedUrlWithoutQuery = stripNullBytes(urlWithoutQuery); return { - prefix: `URL: ${urlWithoutQuery.slice(0, URL_MAX_LENGTH)}${ - urlWithoutQuery.length > URL_MAX_LENGTH ? "..." : "" + prefix: `URL: ${sanitizedUrlWithoutQuery.slice(0, URL_MAX_LENGTH)}${ + sanitizedUrlWithoutQuery.length > URL_MAX_LENGTH ? "..." : "" }\n`, content: `TITLE: ${sanitizedTitle.substring(0, TITLE_MAX_LENGTH)}\n${sanitizedContent}`, sections: [], diff --git a/front/lib/api/data_sources.ts b/front/lib/api/data_sources.ts index d3696692c4c9..d242809d6839 100644 --- a/front/lib/api/data_sources.ts +++ b/front/lib/api/data_sources.ts @@ -34,6 +34,7 @@ import { Ok, sectionFullText, } from "@dust-tt/types"; +import { validateUrl } from "@dust-tt/types/src/shared/utils/url_utils"; import assert from "assert"; import type { Transaction } from "sequelize"; @@ -50,7 +51,6 @@ import type { SpaceResource } from "@app/lib/resources/space_resource"; import { generateRandomModelSId } from "@app/lib/resources/string_ids"; import { ServerSideTracking } from "@app/lib/tracking/server"; import { enqueueUpsertTable } from "@app/lib/upsert_queue"; -import { validateUrl } from "@app/lib/utils"; import logger from "@app/logger/logger"; import { launchScrubDataSourceWorkflow } from "@app/poke/temporal/client"; diff --git a/front/lib/utils.ts b/front/lib/utils.ts index 9dccc15e280a..ea0258990662 100644 --- a/front/lib/utils.ts +++ b/front/lib/utils.ts @@ -89,26 +89,6 @@ export function formatTimestampToFriendlyDate( }); } -export const validateUrl = ( - urlString: string -): { - valid: boolean; - standardized: string | null; -} => { - let url: URL; - try { - url = new URL(urlString); - } catch (e) { - return { valid: false, standardized: null }; - } - - if (url.protocol !== "http:" && url.protocol !== "https:") { - return { valid: false, standardized: null }; - } - - return { valid: true, standardized: url.href }; -}; - // from http://emailregex.com/ const EMAIL_REGEX = /^(([^<>()[\]\\.,;:\s@"]+(\.[^<>()[\]\\.,;:\s@"]+)*)|(".+"))@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))$/; diff --git a/front/pages/api/v1/w/[wId]/spaces/[spaceId]/data_sources/[dsId]/documents/[documentId]/index.ts b/front/pages/api/v1/w/[wId]/spaces/[spaceId]/data_sources/[dsId]/documents/[documentId]/index.ts index 5d173c986c56..c7e1c040c177 100644 --- a/front/pages/api/v1/w/[wId]/spaces/[spaceId]/data_sources/[dsId]/documents/[documentId]/index.ts +++ b/front/pages/api/v1/w/[wId]/spaces/[spaceId]/data_sources/[dsId]/documents/[documentId]/index.ts @@ -11,6 +11,7 @@ import { rateLimiter, sectionFullText, } from "@dust-tt/types"; +import { validateUrl } from "@dust-tt/types/src/shared/utils/url_utils"; import type { NextApiRequest, NextApiResponse } from "next"; import { withPublicAPIAuthentication } from "@app/lib/api/auth_wrappers"; @@ -23,7 +24,6 @@ import { enqueueUpsertDocument, runPostUpsertHooks, } from "@app/lib/upsert_queue"; -import { validateUrl } from "@app/lib/utils"; import logger from "@app/logger/logger"; import { apiError, statsDClient } from "@app/logger/withlogging"; import { launchRunPostDeleteHooksWorkflow } from "@app/temporal/documents_post_process_hooks/client"; diff --git a/types/src/shared/utils/url_utils.ts b/types/src/shared/utils/url_utils.ts new file mode 100644 index 000000000000..77390f4631a4 --- /dev/null +++ b/types/src/shared/utils/url_utils.ts @@ -0,0 +1,19 @@ +export const validateUrl = ( + urlString: string +): { + valid: boolean; + standardized: string | null; +} => { + let url: URL; + try { + url = new URL(urlString); + } catch (e) { + return { valid: false, standardized: null }; + } + + if (url.protocol !== "http:" && url.protocol !== "https:") { + return { valid: false, standardized: null }; + } + + return { valid: true, standardized: url.href }; +};