Skip to content

Commit

Permalink
cli fuzz api (#469)
Browse files Browse the repository at this point in the history
* add cli fuzz command

* typo

* search all files by default

* fix test cases
  • Loading branch information
pelikhan authored May 21, 2024
1 parent 4145da1 commit 56a6009
Show file tree
Hide file tree
Showing 21 changed files with 208 additions and 51 deletions.
8 changes: 7 additions & 1 deletion docs/genaisrc/genaiscript.d.ts

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

31 changes: 28 additions & 3 deletions docs/src/content/docs/reference/cli/commands.md
Original file line number Diff line number Diff line change
Expand Up @@ -260,8 +260,11 @@ Options:
Commands:
index [options] <file...> Index a set of documents
search [options] <query> [files...] Search index
search [options] <query> [files...] Search using vector embeddings
similarity
clear [options] Clear index to force re-indexing
fuzz [options] <query> [files...] Search using string distance
code
help [command] display help for command
```

Expand Down Expand Up @@ -290,11 +293,11 @@ Options:
```
Usage: genaiscript retrieval search [options] <query> [files...]
Search index
Search using vector embeddings similarity
Options:
-ef, --excluded-files <string...> excluded files
-tk, --top-k <number> maximum number of embeddings
-tk, --top-k <number> maximum number of results
-n, --name <string> index name
-h, --help display help for command
```
Expand All @@ -311,6 +314,28 @@ Options:
-h, --help display help for command
```

### `retrieval fuzz`

```
Usage: genaiscript retrieval fuzz [options] <query> [files...]
Search using string distance
Options:
-ef, --excluded-files <string...> excluded files
-tk, --top-k <number> maximum number of results
-h, --help display help for command
```

### `retrieval code`

```
Usage: genaiscript retrieval code [options]
Options:
-h, --help display help for command
```

## `serve`

```
Expand Down
8 changes: 7 additions & 1 deletion genaisrc/genaiscript.d.ts

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 13 additions & 4 deletions packages/cli/src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ import { runScript } from "./run"
import { batchScript } from "./batch"
import {
retrievalClear,
retrievalFuzz,
retrievalIndex,
retrievalSearch,
retrievalTokens,
} from "./retrieval"
import { helpAll } from "./help"
import {
Expand All @@ -32,6 +32,7 @@ import {
parseFence,
parseHTMLToText,
parsePDF,
parseTokens,
} from "./parse"
import { compileScript, createScript, fixScripts, listScripts } from "./scripts"
import { codeQuery } from "./codequery"
Expand Down Expand Up @@ -255,17 +256,25 @@ export async function cli() {
.action(retrievalIndex)
retrieval
.command("search")
.description("Search index")
.description("Search using vector embeddings similarity")
.arguments("<query> [files...]")
.option("-ef, --excluded-files <string...>", "excluded files")
.option("-tk, --top-k <number>", "maximum number of embeddings")
.option("-tk, --top-k <number>", "maximum number of results")
.option("-n, --name <string>", "index name")
.action(retrievalSearch)
retrieval
.command("clear")
.description("Clear index to force re-indexing")
.option("-n, --name <string>", "index name")
.action(retrievalClear)
retrieval
.command("fuzz")
.description("Search using string distance")
.arguments("<query> [files...]")
.option("-ef, --excluded-files <string...>", "excluded files")
.option("-tk, --top-k <number>", "maximum number of results")
.action(retrievalFuzz)
retrieval.command("code")

program
.command("serve")
Expand Down Expand Up @@ -310,7 +319,7 @@ export async function cli() {
.description("Count tokens in a set of files")
.arguments("<files...>")
.option("-ef, --excluded-files <string...>", "excluded files")
.action(retrievalTokens)
.action(parseTokens)
parser
.command("jsonl2json", "Converts JSONL files to a JSON file")
.argument("<file...>", "input JSONL files")
Expand Down
25 changes: 25 additions & 0 deletions packages/cli/src/parse.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ import {
writeText,
parsePdf,
HTMLToText,
estimateTokens,
readText,
} from "genaiscript-core"
import { createProgressSpinner } from "./spinner"
import replaceExt from "replace-ext"
Expand Down Expand Up @@ -56,3 +58,26 @@ export async function jsonl2json(files: string[]) {
}
spinner.stop()
}

/**
 * Estimates the token count of each file matched by a set of globs and
 * prints one `<file>, <tokens>` line per non-empty file to the console.
 *
 * @param filesGlobs - file paths or glob patterns, expanded with `expandFiles`
 * @param options - `excludedFiles` is forwarded to `expandFiles`;
 *   `model` is passed to `estimateTokens` and defaults to `"gpt4"`
 */
export async function parseTokens(
    filesGlobs: string[],
    options?: { excludedFiles?: string[]; model?: string }
) {
    const { model = "gpt4" } = options || {}

    const files = await expandFiles(filesGlobs, options?.excludedFiles)
    const progress = createProgressSpinner(`parsing ${files.length} files`)
    const rows: string[] = []
    try {
        for (const file of files) {
            const content = await readText(file)
            // files with no readable content are skipped
            if (!content) continue
            const row = `${file}, ${estimateTokens(model, content)}`
            progress.report({ message: row })
            rows.push(row)
        }
    } finally {
        // always stop the spinner, even when reading or estimating throws
        progress.stop()
    }
    // each row keeps its trailing newline, same as the previous string concatenation
    console.log(rows.map((row) => `${row}\n`).join(""))
}
42 changes: 21 additions & 21 deletions packages/cli/src/retrieval.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
import {
YAMLStringify,
readText,
upsertVector,
vectorSearch,
clearVectorIndex,
estimateTokens,
normalizeInt,
expandFiles,
normalizeFloat,
fuzzSearch,
} from "genaiscript-core"
import { createProgressSpinner } from "./spinner"

Expand Down Expand Up @@ -79,25 +78,26 @@ export async function retrievalSearch(
console.log(YAMLStringify(res))
}

export async function retrievalTokens(
export async function retrievalFuzz(
q: string,
filesGlobs: string[],
options: { excludedFiles: string[]; model: string }
) {
const { model = "gpt4" } = options || {}

const print = (file: string, content: string) =>
console.log(
`${file}, ${content.length} chars, ${estimateTokens(model, content)} tokens`
)

const files = await expandFiles(filesGlobs, options?.excludedFiles)
let text = ""
for (const file of files) {
const content = await readText(file)
if (content) {
print(file, content)
text += content
}
options: {
excludedFiles: string[]
topK: string
}
print("total", text)
) {
let { excludedFiles, topK } = options || {}
if (!filesGlobs?.length) filesGlobs = ["**"]
if (!excludedFiles?.length) excludedFiles = ["**/node_modules/**"]
const files = await expandFiles(filesGlobs, excludedFiles)
const progress = createProgressSpinner(
`searching '${q}' in ${files.length} files`
)
const res = await fuzzSearch(
q,
files.map((filename) => ({ filename })),
{ topK: normalizeInt(topK) }
)
progress.stop()
console.log(YAMLStringify(res))
}
1 change: 0 additions & 1 deletion packages/core/src/file.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ import {
CSV_REGEX,
DOCX_REGEX,
PDF_REGEX,
XLSX_MIME_TYPE,
XLSX_REGEX,
} from "./constants"
import { tidyData } from "./tidy"
Expand Down
18 changes: 11 additions & 7 deletions packages/core/src/fuzzsearch.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ export async function fuzzSearch(
files: WorkspaceFile[],
options?: FuzzSearchOptions & TraceOptions
): Promise<WorkspaceFileWithScore[]> {
const { trace, ...otherOptions } = options || {}
const { trace, topK, ...otherOptions } = options || {}
// load all files
for (const file of files) await resolveFileContent(file)

Expand All @@ -22,10 +22,14 @@ export async function fuzzSearch(
await miniSearch.addAllAsync(files.filter((f) => !!f.content))

// Search for documents:
const results = miniSearch.search(query)
return results.map((r) => (<WorkspaceFileWithScore>{
filename: r.id,
content: r.content,
score: r.score,
}))
let results = miniSearch.search(query)
if (topK > 0) results = results.slice(0, topK)
return results.map(
(r) =>
<WorkspaceFileWithScore>{
filename: r.id,
content: r.content,
score: r.score,
}
)
}
8 changes: 7 additions & 1 deletion packages/core/src/genaisrc/genaiscript.d.ts

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 7 additions & 1 deletion packages/core/src/types/prompt_template.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1085,6 +1085,10 @@ interface FuzzSearchOptions {
* meaningful results, but can excessively impact search performance.
*/
maxFuzzy?: number
/**
* Maximum number of results to return
*/
topK?: number
}

interface Retrieval {
Expand Down Expand Up @@ -1156,7 +1160,9 @@ interface DataFilter {
distinct?: string[]
}

interface DefDataOptions extends Omit<ContextExpansionOptions, "maxTokens">, DataFilter {
interface DefDataOptions
extends Omit<ContextExpansionOptions, "maxTokens">,
DataFilter {
/**
* Output format in the prompt. Defaults to markdownified CSV
*/
Expand Down
8 changes: 7 additions & 1 deletion packages/sample/genaisrc/genaiscript.d.ts

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 7 additions & 1 deletion packages/sample/genaisrc/node/genaiscript.d.ts

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 56a6009

Please sign in to comment.