Skip to content

Commit

Permalink
adding zips to parsers (#461)
Browse files Browse the repository at this point in the history
* adding zips to parsers

* add fflate

* add unzip tests

* updated docs

* added zip support
  • Loading branch information
pelikhan authored May 17, 2024
1 parent 875b789 commit f99c51e
Show file tree
Hide file tree
Showing 23 changed files with 13,745 additions and 2 deletions.
10 changes: 10 additions & 0 deletions docs/genaisrc/genaiscript.d.ts

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions docs/src/content/docs/reference/scripts/parsers.md
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,14 @@ const res = parsers.XLSX("...filename.xlsx", {
})
```

## Unzip

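Unpacks the contents of a zip file and returns an array of files. Pass the optional `glob` option to extract only the entries whose names match a pattern; binary files are returned with their filename but without content.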

```js
const files = await parsers.unzip(env.files[0])
```

## HTML to Text

The `parsers.HTMLToText` converts HTML to plain text using [html-to-text](https://www.npmjs.com/package/html-to-text).
Expand Down
10 changes: 10 additions & 0 deletions genaisrc/genaiscript.d.ts

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions packages/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
"esbuild": "^0.21.3",
"fast-xml-parser": "^4.3.6",
"fetch-retry": "^6.0.0",
"fflate": "^0.8.2",
"file-type": "^19.0.0",
"gpt-tokenizer": "^2.1.2",
"html-escaper": "^3.0.3",
Expand Down
10 changes: 10 additions & 0 deletions packages/core/src/genaisrc/genaiscript.d.ts

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

35 changes: 33 additions & 2 deletions packages/core/src/parsers.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,24 @@ import assert from "node:assert/strict"
import { createParsers } from "./parsers"
import { MarkdownTrace } from "./trace"
import { XSLXParse } from "./xslx"
import { readFile } from "fs/promises"
import { readFile, writeFile } from "fs/promises"
import { resolve } from "path"
import {
setHost,
Host,
AskUserOptions,
LogLevel,
ModelService,
OAIToken,
ParseService,
ReadFileOptions,
RetrievalService,
ServerManager,
ShellCallOptions,
UTF8Decoder,
UTF8Encoder,
} from "./host"
import { TestHost } from "./testhost"

describe("parsers", () => {
let trace: MarkdownTrace
Expand All @@ -15,6 +31,7 @@ describe("parsers", () => {
trace = new MarkdownTrace()
model = "test model"
parsers = createParsers({ trace, model })
TestHost.install()
})

test("JSON5", () => {
Expand Down Expand Up @@ -43,7 +60,9 @@ describe("parsers", () => {
})

test("XSLX", async () => {
const result = XSLXParse(await readFile(resolve("./src/parsers.test.xlsx")))
const result = XSLXParse(
await readFile(resolve("./src/parsers.test.xlsx"))
)
assert.deepStrictEqual(result, [{ key: 1, value: 2 }])
})

Expand All @@ -52,6 +71,18 @@ describe("parsers", () => {
assert.deepStrictEqual(result, { key: "value" })
})

// Unzips the fixture archive with a markdown-only glob and checks that
// the markdown entry is extracted while the PDF entry is filtered out.
test("zip", async () => {
    const archive = {
        filename: "./src/parsers.test.zip",
        content: undefined,
    }
    const extracted = await parsers.unzip(archive, { glob: "*.md" })
    const names = extracted.map(({ filename }) => filename)
    assert(names.includes("markdown.md"))
    assert(!names.includes("loremipsum.pdf"))
})

test("math", () => {
const res = parsers.math("1 + 3")
assert.strictEqual(res, 4)
Expand Down
Binary file added packages/core/src/parsers.test.zip
Binary file not shown.
3 changes: 3 additions & 0 deletions packages/core/src/parsers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import { MathTryEvaluate } from "./math"
import { validateJSONWithSchema } from "./schema"
import { XSLXTryParse } from "./xslx"
import { host } from "./host"
import { unzip } from "./zip"

export function createParsers(options: {
trace: MarkdownTrace
Expand Down Expand Up @@ -49,6 +50,8 @@ export function createParsers(options: {
dotEnv: (text) => dotEnvTryParse(filenameOrFileToContent(text)),
INI: (text, options) =>
INITryParse(filenameOrFileToContent(text), options?.defaultValue),
unzip: async (file, options) =>
await unzip(await host.readFile(file.filename), options),
tokens: (text) => estimateTokens(model, filenameOrFileToContent(text)),
fences: (text) => extractFenced(filenameOrFileToContent(text)),
annotations: (text) => parseAnnotations(filenameOrFileToContent(text)),
Expand Down
96 changes: 96 additions & 0 deletions packages/core/src/testhost.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import { readFile, writeFile } from "fs/promises"
import {
setHost,
Host,
AskUserOptions,
LogLevel,
ModelService,
OAIToken,
ParseService,
ReadFileOptions,
RetrievalService,
ServerManager,
ShellCallOptions,
UTF8Decoder,
UTF8Encoder,
} from "./host"
import { resolve } from "path"

/**
 * Shared failure path for every host capability the test host does not provide.
 */
function notImplemented(): never {
    throw new Error("Method not implemented.")
}

/**
 * Minimal `Host` used by unit tests.
 *
 * Only the UTF-8 codecs and `readFile`/`writeFile` are functional; every
 * other method throws `Error("Method not implemented.")` when called.
 */
export class TestHost implements Host {
    userState: any
    parser: ParseService
    retrieval: RetrievalService
    models: ModelService
    server: ServerManager
    path: Path
    workspace: WorkspaceFileSystem

    /**
     * Creates a fresh `TestHost` and registers it through `setHost`.
     */
    static install() {
        const testHost = new TestHost()
        setHost(testHost)
    }

    /** Returns a UTF-8 `TextDecoder`. */
    createUTF8Decoder(): UTF8Decoder {
        const decoder = new TextDecoder("utf-8")
        return decoder
    }

    /** Returns a `TextEncoder`. */
    createUTF8Encoder(): UTF8Encoder {
        const encoder = new TextEncoder()
        return encoder
    }

    projectFolder(): string {
        return notImplemented()
    }

    installFolder(): string {
        return notImplemented()
    }

    resolvePath(...segments: string[]): string {
        return notImplemented()
    }

    readSecret(name: string): Promise<string> {
        return notImplemented()
    }

    getSecretToken(modelId: string): Promise<OAIToken> {
        return notImplemented()
    }

    log(level: LogLevel, msg: string): void {
        notImplemented()
    }

    /**
     * Reads a file from disk and returns its bytes.
     * @param name file path, resolved with `path.resolve`
     * @param options unused by this implementation
     */
    async readFile(
        name: string,
        options?: ReadFileOptions
    ): Promise<Uint8Array> {
        const absolutePath = resolve(name)
        const buffer = await readFile(absolutePath)
        return new Uint8Array(buffer)
    }

    /**
     * Writes bytes to a file on disk.
     * @param name file path, resolved with `path.resolve`
     * @param content bytes to write
     */
    async writeFile(name: string, content: Uint8Array): Promise<void> {
        const absolutePath = resolve(name)
        await writeFile(absolutePath, content)
    }

    deleteFile(name: string): Promise<void> {
        return notImplemented()
    }

    findFiles(glob: string): Promise<string[]> {
        return notImplemented()
    }

    clearVirtualFiles(): void {
        notImplemented()
    }

    setVirtualFile(name: string, content: string): void {
        notImplemented()
    }

    isVirtualFile(name: string): boolean {
        return notImplemented()
    }

    createDirectory(name: string): Promise<void> {
        return notImplemented()
    }

    deleteDirectory(name: string): Promise<void> {
        return notImplemented()
    }

    askUser(options: AskUserOptions): Promise<string> {
        return notImplemented()
    }

    exec(
        command: string,
        args: string[],
        options: ShellCallOptions
    ): Promise<Partial<ShellOutput>> {
        return notImplemented()
    }
}
10 changes: 10 additions & 0 deletions packages/core/src/types/prompt_template.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -747,6 +747,9 @@ interface ParseXLSXOptions {
range?: string
}

/**
 * Options for extracting files from a zip archive.
 */
interface ParseZipOptions {
/**
 * Glob pattern matched against each entry name in the archive;
 * when set, only matching entries are extracted.
 */
glob?: string
}

interface Parsers {
/**
Expand Down Expand Up @@ -852,6 +855,13 @@ interface Parsers {
options?: HTMLToTextOptions
): string

/**
 * Extracts the contents of a zip archive file
 * @param file zip archive to unpack; the archive bytes are read from `file.filename`
 * @param options optional extraction settings, e.g. a `glob` to select which entries to extract
 * @returns the extracted files; entries detected as binary carry a filename but no content
 */
unzip(file: WorkspaceFile, options?: ParseZipOptions): Promise<WorkspaceFile[]>

/**
* Estimates the number of tokens in the content.
* @param content content to tokenize
Expand Down
28 changes: 28 additions & 0 deletions packages/core/src/zip.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import { unzipSync } from "fflate"
import { minimatch } from "minimatch"
import { lookupMime } from "./mime"
import { isBinaryMimeType } from "./parser"
import { host } from "./host"

/**
 * Extracts the file entries of an in-memory zip archive.
 *
 * Directory entries (names ending in "/") are skipped so the result only
 * lists files. Entries for which `isBinaryMimeType` reports a binary MIME
 * type are returned with a filename but no content; every other entry is
 * decoded as UTF-8 text.
 *
 * @param data raw bytes of the zip archive
 * @param options optional settings; `glob` restricts extraction to entry
 * names matching the minimatch pattern
 * @returns one `WorkspaceFile` per extracted file entry
 */
export async function unzip(
    data: Uint8Array,
    options?: ParseZipOptions
): Promise<WorkspaceFile[]> {
    const { glob } = options ?? {}
    const entries = unzipSync(data, {
        filter: (file: { name: string; originalSize: number }) => {
            // Archives list folders as zero-length entries with a trailing
            // slash; without this check they would surface as empty text files.
            if (file.name.endsWith("/")) return false
            if (glob)
                return minimatch(file.name, glob, {
                    windowsPathsNoEscape: true,
                })
            return true
        },
    })
    const decoder = host.createUTF8Decoder()
    return Object.entries(entries).map(([filename, bytes]) => {
        const mime = lookupMime(filename)
        if (isBinaryMimeType(mime))
            return { filename } as WorkspaceFile // TODO bytes support
        return { filename, content: decoder.decode(bytes) } as WorkspaceFile
    })
}
10 changes: 10 additions & 0 deletions packages/sample/genaisrc/genaiscript.d.ts

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 10 additions & 0 deletions packages/sample/genaisrc/node/genaiscript.d.ts

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 10 additions & 0 deletions packages/sample/genaisrc/python/genaiscript.d.ts

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit f99c51e

Please sign in to comment.