refactor: ♻️ rename chunkText to chunk; accept a string or WorkspaceFile and return the chunks array directly

microsoft · Oct 23, 2024 · 34b1113 · 34b1113
1 parent 7c91515
commit 34b1113
Show file tree

Hide file tree

Showing 4 changed files with 31 additions and 27 deletions.
diff --git a/packages/core/src/encoders.test.ts b/packages/core/src/encoders.test.ts
@@ -1,6 +1,6 @@
 import test, { describe } from "node:test"
 import assert from "node:assert"
-import { chunkText, resolveTokenEncoder } from "./encoders"
+import { chunk, resolveTokenEncoder } from "./encoders"
 
 describe("resolveTokenEncoder", () => {
     test("gpt-3.5-turbo", async () => {
@@ -29,8 +29,10 @@ describe("resolveTokenEncoder", () => {
         assert.deepEqual(result, [27, 91, 321, 13707, 91, 29])
     })
     test("gpt-4o chunk", async () => {
-        const { chunks } = await chunkText(
-            `---
+        const chunks = await chunk(
+            {
+                filename: "markdown.md",
+                content: `---
 title: What is Markdown? - Understanding Markdown Syntax
 description: Learn about Markdown, a lightweight markup language for formatting plain text, its syntax, and how it differs from WYSIWYG editors.
 keywords: Markdown, markup language, formatting, plain text, syntax
@@ -43,11 +45,11 @@ Using Markdown is different than using a WYSIWYG editor. In an application like
 
 For example, to denote a heading, you add a number sign before it (e.g., # Heading One). Or to make a phrase bold, you add two asterisks before and after it (e.g., **this text is bold**). It may take a while to get used to seeing Markdown syntax in your text, especially if you’re accustomed to WYSIWYG applications. The screenshot below shows a Markdown file displayed in the Visual Studio Code text editor....
 `,
+            },
             {
                 chunkSize: 15,
                 chunkOverlap: 2,
                 model: "gpt-4o",
-                filename: "markdown.md",
             }
         )
         assert.equal(chunks.length, 21)

diff --git a/packages/core/src/encoders.ts b/packages/core/src/encoders.ts
@@ -3,6 +3,8 @@ import { parseModelIdentifier } from "./models"
 import { runtimeHost } from "./host"
 import path from "node:path"
 import { addLineNumbers, indexToLineNumber } from "./liner"
+import { resolveFileContent } from "./file"
+import { NotSupportedError } from "./error"
 
 /**
  * Resolves the appropriate token encoder based on the given model ID.
@@ -35,18 +37,25 @@ export async function resolveTokenEncoder(modelId: string): Promise<Tokenizer> {
     }
 }
 
-export async function chunkText(
-    text: string,
+export async function chunk(
+    file: Awaitable<string | WorkspaceFile>,
     options?: TextChunkerConfig
-): Promise<{
-    model: string
-    docType: string
-    chunks: TextChunk[]
-}> {
+): Promise<TextChunk[]> {
+    const f = await file
+    let filename: string
+    let content: string
+    if (typeof f === "string") {
+        filename = undefined
+        content = f
+    } else if (typeof f === "object") {
+        await resolveFileContent(f)
+        filename = f.filename
+        content = f.content
+    } else return []
+
     const {
         model,
         docType: optionsDocType,
-        filename,
         lineNumbers,
         ...rest
     } = options || {}
@@ -62,10 +71,10 @@ export async function chunkText(
         docType,
         tokenizer,
     })
-    const chunksRaw = ts.split(text)
+    const chunksRaw = ts.split(content)
     const chunks = chunksRaw.map(({ tokens, startPos, endPos }) => {
-        const lineStart = indexToLineNumber(text, startPos)
-        const lineEnd = indexToLineNumber(text, endPos)
+        const lineStart = indexToLineNumber(content, startPos)
+        const lineEnd = indexToLineNumber(content, endPos)
         let chunkText = tokenizer.decode(tokens || [])
         if (lineNumbers)
             chunkText = addLineNumbers(chunkText, { startLine: lineStart })
@@ -75,5 +84,5 @@ export async function chunkText(
             lineEnd,
         } satisfies TextChunk
     })
-    return { model: tokenizer.model, docType, chunks }
+    return chunks
 }
diff --git a/packages/core/src/globals.ts b/packages/core/src/globals.ts
@@ -12,12 +12,10 @@ import { JSONLStringify, JSONLTryParse } from "./jsonl"
 import { HTMLTablesToJSON, HTMLToMarkdown, HTMLToText } from "./html"
 import { CancelError } from "./error"
 import { fetchText } from "./fetch"
-import { readText } from "./fs"
-import { logVerbose } from "./util"
 import { GitHubClient } from "./github"
 import { GitClient } from "./git"
 import { estimateTokens, truncateTextToTokens } from "./tokens"
-import { chunkText, resolveTokenEncoder } from "./encoders"
+import { chunk, resolveTokenEncoder } from "./encoders"
 import { runtimeHost } from "./host"
 
 /**
@@ -136,7 +134,7 @@ export function installGlobals() {
             )
             return await truncateTextToTokens(text, maxTokens, encoder, options)
         },
-        chunk: chunkText,
+        chunk: chunk,
     })
 
     /**

diff --git a/packages/core/src/types/prompt_template.d.ts b/packages/core/src/types/prompt_template.d.ts
@@ -1136,7 +1136,6 @@ interface TextChunkerConfig {
         | "rst"
         | "rust"
     >
-    filename?: string
 }
 
 interface Tokenizers {
@@ -1172,13 +1171,9 @@ interface Tokenizers {
      * @param options
      */
     chunk(
-        text: string,
+        file: Awaitable<string | WorkspaceFile>,
         options?: TextChunkerConfig
-    ): Promise<{
-        model: string
-        docType: string
-        chunks: TextChunk[]
-    }>
+    ): Promise<TextChunk[]>
 }
 
 interface HashOptions {