Skip to content

Commit

Permalink
refactor: ♻️ rename chunkText to chunk function
Browse files Browse the repository at this point in the history
  • Loading branch information
pelikhan committed Oct 23, 2024
1 parent 7c91515 commit 34b1113
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 27 deletions.
10 changes: 6 additions & 4 deletions packages/core/src/encoders.test.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import test, { describe } from "node:test"
import assert from "node:assert"
import { chunkText, resolveTokenEncoder } from "./encoders"
import { chunk, resolveTokenEncoder } from "./encoders"

describe("resolveTokenEncoder", () => {
test("gpt-3.5-turbo", async () => {
Expand Down Expand Up @@ -29,8 +29,10 @@ describe("resolveTokenEncoder", () => {
assert.deepEqual(result, [27, 91, 321, 13707, 91, 29])
})
test("gpt-4o chunk", async () => {
const { chunks } = await chunkText(
`---
const chunks = await chunk(
{
filename: "markdown.md",
content: `---
title: What is Markdown? - Understanding Markdown Syntax
description: Learn about Markdown, a lightweight markup language for formatting plain text, its syntax, and how it differs from WYSIWYG editors.
keywords: Markdown, markup language, formatting, plain text, syntax
Expand All @@ -43,11 +45,11 @@ Using Markdown is different than using a WYSIWYG editor. In an application like
For example, to denote a heading, you add a number sign before it (e.g., # Heading One). Or to make a phrase bold, you add two asterisks before and after it (e.g., **this text is bold**). It may take a while to get used to seeing Markdown syntax in your text, especially if you’re accustomed to WYSIWYG applications. The screenshot below shows a Markdown file displayed in the Visual Studio Code text editor....
`,
},
{
chunkSize: 15,
chunkOverlap: 2,
model: "gpt-4o",
filename: "markdown.md",
}
)
assert.equal(chunks.length, 21)
Expand Down
33 changes: 21 additions & 12 deletions packages/core/src/encoders.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ import { parseModelIdentifier } from "./models"
import { runtimeHost } from "./host"
import path from "node:path"
import { addLineNumbers, indexToLineNumber } from "./liner"
import { resolveFileContent } from "./file"
import { NotSupportedError } from "./error"

/**
* Resolves the appropriate token encoder based on the given model ID.
Expand Down Expand Up @@ -35,18 +37,25 @@ export async function resolveTokenEncoder(modelId: string): Promise<Tokenizer> {
}
}

export async function chunkText(
text: string,
export async function chunk(
file: Awaitable<string | WorkspaceFile>,
options?: TextChunkerConfig
): Promise<{
model: string
docType: string
chunks: TextChunk[]
}> {
): Promise<TextChunk[]> {
const f = await file
let filename: string
let content: string
if (typeof f === "string") {
filename = undefined
content = f
} else if (typeof f === "object") {
await resolveFileContent(f)
filename = f.filename
content = f.content
} else return []

const {
model,
docType: optionsDocType,
filename,
lineNumbers,
...rest
} = options || {}
Expand All @@ -62,10 +71,10 @@ export async function chunkText(
docType,
tokenizer,
})
const chunksRaw = ts.split(text)
const chunksRaw = ts.split(content)
const chunks = chunksRaw.map(({ tokens, startPos, endPos }) => {
const lineStart = indexToLineNumber(text, startPos)
const lineEnd = indexToLineNumber(text, endPos)
const lineStart = indexToLineNumber(content, startPos)
const lineEnd = indexToLineNumber(content, endPos)
let chunkText = tokenizer.decode(tokens || [])
if (lineNumbers)
chunkText = addLineNumbers(chunkText, { startLine: lineStart })
Expand All @@ -75,5 +84,5 @@ export async function chunkText(
lineEnd,
} satisfies TextChunk
})
return { model: tokenizer.model, docType, chunks }
return chunks
}
6 changes: 2 additions & 4 deletions packages/core/src/globals.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,10 @@ import { JSONLStringify, JSONLTryParse } from "./jsonl"
import { HTMLTablesToJSON, HTMLToMarkdown, HTMLToText } from "./html"
import { CancelError } from "./error"
import { fetchText } from "./fetch"
import { readText } from "./fs"
import { logVerbose } from "./util"
import { GitHubClient } from "./github"
import { GitClient } from "./git"
import { estimateTokens, truncateTextToTokens } from "./tokens"
import { chunkText, resolveTokenEncoder } from "./encoders"
import { chunk, resolveTokenEncoder } from "./encoders"
import { runtimeHost } from "./host"

/**
Expand Down Expand Up @@ -136,7 +134,7 @@ export function installGlobals() {
)
return await truncateTextToTokens(text, maxTokens, encoder, options)
},
chunk: chunkText,
chunk: chunk,
})

/**
Expand Down
9 changes: 2 additions & 7 deletions packages/core/src/types/prompt_template.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1136,7 +1136,6 @@ interface TextChunkerConfig {
| "rst"
| "rust"
>
filename?: string
}

interface Tokenizers {
Expand Down Expand Up @@ -1172,13 +1171,9 @@ interface Tokenizers {
* @param options
*/
chunk(
text: string,
file: Awaitable<string | WorkspaceFile>,
options?: TextChunkerConfig
): Promise<{
model: string
docType: string
chunks: TextChunk[]
}>
): Promise<TextChunk[]>
}

interface HashOptions {
Expand Down

0 comments on commit 34b1113

Please sign in to comment.