diff --git a/packages/core/src/encoders.test.ts b/packages/core/src/encoders.test.ts index c225188131..11992690ae 100644 --- a/packages/core/src/encoders.test.ts +++ b/packages/core/src/encoders.test.ts @@ -1,6 +1,6 @@ import test, { describe } from "node:test" import assert from "node:assert" -import { chunkText, resolveTokenEncoder } from "./encoders" +import { chunk, resolveTokenEncoder } from "./encoders" describe("resolveTokenEncoder", () => { test("gpt-3.5-turbo", async () => { @@ -29,8 +29,10 @@ describe("resolveTokenEncoder", () => { assert.deepEqual(result, [27, 91, 321, 13707, 91, 29]) }) test("gpt-4o chunk", async () => { - const { chunks } = await chunkText( - `--- + const chunks = await chunk( + { + filename: "markdown.md", + content: `--- title: What is Markdown? - Understanding Markdown Syntax description: Learn about Markdown, a lightweight markup language for formatting plain text, its syntax, and how it differs from WYSIWYG editors. keywords: Markdown, markup language, formatting, plain text, syntax @@ -43,11 +45,11 @@ Using Markdown is different than using a WYSIWYG editor. In an application like For example, to denote a heading, you add a number sign before it (e.g., # Heading One). Or to make a phrase bold, you add two asterisks before and after it (e.g., **this text is bold**). It may take a while to get used to seeing Markdown syntax in your text, especially if you’re accustomed to WYSIWYG applications. The screenshot below shows a Markdown file displayed in the Visual Studio Code text editor.... `, + }, { chunkSize: 15, chunkOverlap: 2, model: "gpt-4o", - filename: "markdown.md", } ) assert.equal(chunks.length, 21) diff --git a/packages/core/src/encoders.ts b/packages/core/src/encoders.ts index 9937a68f12..d5dffbcc91 100644 --- a/packages/core/src/encoders.ts +++ b/packages/core/src/encoders.ts @@ -3,6 +3,8 @@ import { parseModelIdentifier } from "./models" import { runtimeHost } from "./host" import path from "node:path" import { addLineNumbers, indexToLineNumber } from "./liner" +import { resolveFileContent } from "./file" +import { NotSupportedError } from "./error" /** * Resolves the appropriate token encoder based on the given model ID. @@ -35,18 +37,25 @@ export async function resolveTokenEncoder(modelId: string): Promise { } } -export async function chunkText( - text: string, +export async function chunk( + file: Awaitable, options?: TextChunkerConfig -): Promise<{ - model: string - docType: string - chunks: TextChunk[] -}> { +): Promise { + const f = await file + let filename: string + let content: string + if (typeof f === "string") { + filename = undefined + content = f + } else if (typeof f === "object") { + await resolveFileContent(f) + filename = f.filename + content = f.content + } else return [] + const { model, docType: optionsDocType, - filename, lineNumbers, ...rest } = options || {} @@ -62,10 +71,10 @@ export async function chunkText( docType, tokenizer, }) - const chunksRaw = ts.split(text) + const chunksRaw = ts.split(content) const chunks = chunksRaw.map(({ tokens, startPos, endPos }) => { - const lineStart = indexToLineNumber(text, startPos) - const lineEnd = indexToLineNumber(text, endPos) + const lineStart = indexToLineNumber(content, startPos) + const lineEnd = indexToLineNumber(content, endPos) let chunkText = tokenizer.decode(tokens || []) if (lineNumbers) chunkText = addLineNumbers(chunkText, { startLine: lineStart }) @@ -75,5 +84,5 @@ export async function chunkText( lineEnd, } satisfies TextChunk }) - return { model: tokenizer.model, docType, chunks } + return chunks } diff --git a/packages/core/src/globals.ts b/packages/core/src/globals.ts index 90813942b4..41992dba93 100644 --- a/packages/core/src/globals.ts +++ b/packages/core/src/globals.ts @@ -12,12 +12,10 @@ import { JSONLStringify, JSONLTryParse } from "./jsonl" import { HTMLTablesToJSON, HTMLToMarkdown, HTMLToText } from "./html" import { CancelError } from "./error" import { fetchText } from "./fetch" -import { readText } from "./fs" -import { logVerbose } from "./util" import { GitHubClient } from "./github" import { GitClient } from "./git" import { estimateTokens, truncateTextToTokens } from "./tokens" -import { chunkText, resolveTokenEncoder } from "./encoders" +import { chunk, resolveTokenEncoder } from "./encoders" import { runtimeHost } from "./host" /** @@ -136,7 +134,7 @@ export function installGlobals() { ) return await truncateTextToTokens(text, maxTokens, encoder, options) }, - chunk: chunkText, + chunk: chunk, }) /** diff --git a/packages/core/src/types/prompt_template.d.ts b/packages/core/src/types/prompt_template.d.ts index 69e04c1386..1658edc793 100644 --- a/packages/core/src/types/prompt_template.d.ts +++ b/packages/core/src/types/prompt_template.d.ts @@ -1136,7 +1136,6 @@ interface TextChunkerConfig { | "rst" | "rust" > - filename?: string } interface Tokenizers { @@ -1172,13 +1171,9 @@ interface Tokenizers { * @param options */ chunk( - text: string, + file: Awaitable, options?: TextChunkerConfig - ): Promise<{ - model: string - docType: string - chunks: TextChunk[] - }> + ): Promise } interface HashOptions {