Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Decode .tar.gz archives using liblzma (via WebAssembly) #173

Merged
merged 4 commits into from
Oct 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion src/domain/release-archive.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import path from "node:path";
import { parse as parseUrl } from "node:url";
import tar from "tar";
import { UserFacingError } from "./error.js";
import { decompress as decompressXz } from "../infrastructure/xzdec/xzdec.js";

import { ModuleFile } from "./module-file.js";

Expand Down Expand Up @@ -61,7 +62,7 @@ export class ReleaseArchive {
public async extractModuleFile(): Promise<ModuleFile> {
this.extractDir = path.dirname(this._diskPath);

if (this._diskPath.endsWith(".tar.gz")) {
if (this.isSupportedTarball()) {
await this.extractReleaseTarball(this.extractDir);
} else if (this._diskPath.endsWith(".zip")) {
await this.extractReleaseZip(this.extractDir);
Expand All @@ -81,7 +82,26 @@ export class ReleaseArchive {
return new ModuleFile(extractedModulePath);
}

/**
 * Reports whether the archive at `_diskPath` is a tarball format that
 * `extractReleaseTarball` handles, judged purely by its file-name suffix
 * (`.tar.gz` or `.tar.xz`).
 */
private isSupportedTarball(): boolean {
  const tarballSuffixes = [".tar.gz", ".tar.xz"];
  return tarballSuffixes.some((suffix) => this._diskPath.endsWith(suffix));
}

private async extractReleaseTarball(extractDir: string): Promise<void> {
if (this._diskPath.endsWith(".tar.xz")) {
const reader = fs.createReadStream(this._diskPath);
const writer = tar.x({
cwd: extractDir
});
await decompressXz(reader, writer);
return;
}

await tar.x({
cwd: extractDir,
file: this._diskPath,
Expand Down
2 changes: 2 additions & 0 deletions src/infrastructure/xzdec/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
/bazel-*
/MODULE.bazel.lock
33 changes: 33 additions & 0 deletions src/infrastructure/xzdec/BUILD.bazel
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
load("@aspect_bazel_lib//lib:write_source_files.bzl", "write_source_file")
load(":wasm.bzl", "wasm_binary")

cc_binary(
name = "xzdec",
srcs = ["xzdec.c"],
linkopts = [
"-nostdlib",
"-lc",
"-Wl,--no-entry",
],
tags = ["manual"],
deps = ["@xz//:lzma"],
)

wasm_binary(
name = "xzdec_wasm",
out = "xzdec.wasm",
lib = ":xzdec",
)

genrule(
name = "xzdec_wasm_gz",
srcs = [":xzdec_wasm"],
outs = ["xzdec_wasm_gz/xzdec.wasm.gz"],
cmd = "cat $< | gzip -9 -k -n > $@",
)

write_source_file(
name = "write_xzdec_wasm_gz_to_source_tree",
in_file = ":xzdec_wasm_gz",
out_file = "xzdec.wasm.gz",
)
45 changes: 45 additions & 0 deletions src/infrastructure/xzdec/MODULE.bazel
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
module(name = "publish-to-bcr")

bazel_dep(name = "aspect_bazel_lib", version = "2.9.3")
bazel_dep(name = "platforms", version = "0.0.10")
bazel_dep(name = "toolchains_llvm", version = "1.2.0")
bazel_dep(name = "xz", version = "5.4.5.bcr.5")

# https://github.com/bazel-contrib/toolchains_llvm/pull/405
#
# FIXME: Remove when a new `toolchains_llvm` has been released.
git_override(
module_name = "toolchains_llvm",
commit = "bda1c9fbf232b682c30d039f8e4a5e3cf3025d0f",
remote = "https://github.com/bazel-contrib/toolchains_llvm",
)

llvm = use_extension("@toolchains_llvm//toolchain/extensions:llvm.bzl", "llvm")
llvm.toolchain(
libclang_rt = {
"@libclang_rt-wasm32-wasi//:libclang_rt.builtins-wasm32.a": "wasm32-unknown-unknown/libclang_rt.builtins.a",
},
llvm_versions = {
# Pin to an older LLVM version due to a stray Homebrew dependency
# in the macOS build of v19.1.0.
#
# https://github.com/llvm/llvm-project/issues/110070
"": "18.1.8",
},
stdlib = {"wasm32": "libc"},
)
llvm.sysroot(
label = "@wasi-sysroot//sysroots/wasm32-wasip2",
targets = ["wasm32"],
)
use_repo(llvm, "llvm_toolchain")

register_toolchains("@llvm_toolchain//:all")

wasi_sysroot = use_repo_rule("//:wasm.bzl", "wasi_sysroot")

wasm32_libclang_rt = use_repo_rule("//:wasm.bzl", "wasm32_libclang_rt")

wasi_sysroot(name = "wasi-sysroot")

wasm32_libclang_rt(name = "libclang_rt-wasm32-wasi")
33 changes: 33 additions & 0 deletions src/infrastructure/xzdec/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# xz decompressor

This directory contains a WebAssembly module for decompressing an Xz file using
[liblzma], along with a JavaScript wrapper that adapts `xzdec.wasm` for use
with the Node.js [`node:stream`] library.

[liblzma]: https://github.com/tukaani-project/xz/tree/v5.4.5/src/liblzma
[`node:stream`]: https://nodejs.org/docs/latest-v18.x/api/stream.html

Files:
- `xzdec.c` is a thin wrapper around liblzma that exports functions with a
WebAssembly-style ABI. It compiles to `xzdec.wasm`.
- `xzdec.wasm.gz` is a gzip-compressed `xzdec.wasm`, to reduce the size impact
of checking generated build artifacts into Git.
- `xzdec.ts` exports the `decompress(r: stream.Readable, w: stream.Writable)`
function, which instantiates a WebAssembly module from `xzdec.wasm.gz` and
decompresses an Xz bitstream.

When building a new version of `xzdec.wasm.gz`, or verifying that the checked-in
artifact matches the expected output, Bazel should be run with `-c opt` so that
the compiled output is optimized.

```
$ cd src/infrastructure/xzdec
$ bazel build -c opt //:xzdec_wasm_gz
$ diff -s xzdec.wasm.gz bazel-bin/xzdec_wasm_gz/xzdec.wasm.gz
Files xzdec.wasm.gz and bazel-bin/xzdec_wasm_gz/xzdec.wasm.gz are identical
$
```

Note that variations in gzip compression may cause spurious differences
between the checked-in `xzdec.wasm.gz` and a freshly built one. In that case,
decompress both files and compare the resulting `xzdec.wasm` binaries directly
for a more reliable check.
86 changes: 86 additions & 0 deletions src/infrastructure/xzdec/wasm.bzl
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
_WASM_ABIS = [
"wasm32-wasip2",
]

def _platform_transition(settings, attr):
    """Transition implementation: sets `--platforms` to the rule's `_platform` attribute.

    `settings` is unused; the result depends only on `attr._platform`.
    """
    target_platform = str(attr._platform)
    return {"//command_line_option:platforms": target_platform}

platform_transition = transition(
implementation = _platform_transition,
inputs = [],
outputs = ["//command_line_option:platforms"],
)

def _wasm_binary(ctx):
    """Rule implementation: exposes the `lib` file under a `.wasm` output name.

    Uses the explicit `out` output when one is set, otherwise declares
    `<name>.wasm`, and symlinks that output to the single file behind `lib`.
    """
    wasm_file = ctx.outputs.out or ctx.actions.declare_file(ctx.attr.name + ".wasm")
    ctx.actions.symlink(target_file = ctx.file.lib, output = wasm_file)
    return DefaultInfo(files = depset([wasm_file]))

wasm_binary = rule(
implementation = _wasm_binary,
attrs = {
"lib": attr.label(
allow_single_file = True,
cfg = platform_transition,
),
"out": attr.output(),
"_platform": attr.label(
default = Label("@toolchains_llvm//platforms:wasm32"),
),
"_allowlist_function_transition": attr.label(
default = "@bazel_tools//tools/allowlists/function_transition_allowlist",
),
},
)

_SYSROOT_BUILD = """
filegroup(
name = {name},
srcs = glob(["include/**/*", "lib/**/*", "share/**/*"], allow_empty=True),
visibility = ["//visibility:public"],
)
"""

def _wasi_sysroot(ctx):
    """Repository rule implementation: fetches the WASI sysroot and splits it per ABI.

    After extracting the archive, the `include/<abi>`, `lib/<abi>` and
    `share/<abi>` directories of every ABI in `_WASM_ABIS` are moved to
    `sysroots/<abi>/{include,lib,share}`, next to a generated BUILD file.
    The results of the `mv` commands are not inspected.
    """
    ctx.download_and_extract(
        url = ["https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-24/wasi-sysroot-24.0.tar.gz"],
        integrity = "sha256-NRcvfSeZSFsVpGsdh/UKWF2RXsZiCA8AXZkVOlCIjwg=",
        stripPrefix = "wasi-sysroot-24.0",
    )

    ctx.file("BUILD.bazel", "")
    ctx.file("sysroots/BUILD.bazel", "")
    for abi in _WASM_ABIS:
        sysroot_dir = "sysroots/%s" % (abi,)
        build_content = _SYSROOT_BUILD.format(name = repr(abi))
        ctx.file(sysroot_dir + "/BUILD.bazel", build_content)
        for subdir in ["include", "lib", "share"]:
            ctx.execute(["mv", subdir + "/" + abi, sysroot_dir + "/" + subdir])

wasi_sysroot = repository_rule(
implementation = _wasi_sysroot,
)

def _wasm32_libclang_rt(ctx):
    """Repository rule implementation: fetches the wasm32 compiler-rt builtins archive.

    Writes a BUILD file that exports `libclang_rt.builtins-wasm32.a`, then
    downloads and extracts the wasi-sdk-24 release archive into the repository.
    """
    build_content = """
exports_files(["libclang_rt.builtins-wasm32.a"])

filegroup(
name = "libclang_rt-wasm32-wasi",
srcs = ["libclang_rt.builtins-wasm32.a"],
visibility = ["//visibility:public"],
)
"""
    ctx.file("BUILD.bazel", build_content)

    ctx.download_and_extract(
        url = ["https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-24/libclang_rt.builtins-wasm32-wasi-24.0.tar.gz"],
        integrity = "sha256-fjPA33WLkEabHePKFY4tCn9xk01YhFJbpqNy3gs7Dsc=",
        stripPrefix = "libclang_rt.builtins-wasm32-wasi-24.0",
    )

wasm32_libclang_rt = repository_rule(
implementation = _wasm32_libclang_rt,
)
95 changes: 95 additions & 0 deletions src/infrastructure/xzdec/xzdec.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#include <stdint.h>
#include <stdlib.h>

#include <lzma.h>

typedef uint32_t xzdec_lzma_ret;

struct Xzdec {
lzma_stream stream;
};

/*
 * Allocates `len` bytes with malloc() and returns the resulting pointer
 * (NULL when malloc() fails). Exported as `xzdec_allocate`.
 */
__attribute__((export_name("xzdec_allocate")))
uint8_t *xzdec_allocate(uint32_t len) {
    void *block = malloc(len);
    return (uint8_t *)block;
}

/*
 * Releases a buffer by passing `ptr` to free(). Exported as
 * `xzdec_deallocate`.
 */
__attribute__((export_name("xzdec_deallocate")))
void xzdec_deallocate(uint8_t *ptr) {
    void *block = ptr;
    free(block);
}

__attribute__((export_name("xzdec_new_stream_decoder")))
xzdec_lzma_ret xzdec_new_stream_decoder(
uint32_t memlimit,
uint32_t flags,
struct Xzdec **xzdec_ptr
) {
lzma_stream stream = LZMA_STREAM_INIT;
lzma_ret rc = lzma_stream_decoder(&stream, memlimit, flags);
if (rc != LZMA_OK) {
return rc;
}
*xzdec_ptr = malloc(sizeof(struct Xzdec));
(*xzdec_ptr)->stream = stream;
return LZMA_OK;
}

__attribute__((export_name("xzdec_drop")))
void xzdec_drop(struct Xzdec *xzdec) {
lzma_end(&(xzdec->stream));
free(xzdec);
}

/*
 * Returns 1 when the stream's `avail_in` is zero (no input bytes remain),
 * and 0 otherwise. Exported as `xzdec_input_empty`.
 */
__attribute__((export_name("xzdec_input_empty")))
uint32_t xzdec_input_empty(struct Xzdec *xzdec) {
    return (xzdec->stream.avail_in == 0) ? 1 : 0;
}

__attribute__((export_name("xzdec_set_input")))
void xzdec_set_input(
struct Xzdec *xzdec,
const uint8_t *input_buf,
uint32_t input_buf_len
) {
xzdec->stream.next_in = input_buf;
xzdec->stream.avail_in = input_buf_len;
}

__attribute__((export_name("xzdec_next_output")))
xzdec_lzma_ret xzdec_next_output(
struct Xzdec *xzdec,
uint8_t *output_buf,
uint32_t output_buf_cap,
uint32_t *output_buf_len
) {
xzdec->stream.next_out = output_buf;
xzdec->stream.avail_out = output_buf_cap;
lzma_ret rc = lzma_code(&(xzdec->stream), LZMA_RUN);
*output_buf_len = output_buf_cap - xzdec->stream.avail_out;
return rc;
}

__attribute__((export_name("xzdec_finish")))
xzdec_lzma_ret xzdec_finish(
struct Xzdec *xzdec,
uint8_t *output_buf,
uint32_t output_buf_cap,
uint32_t *output_buf_len
) {
xzdec->stream.next_out = output_buf;
xzdec->stream.avail_out = output_buf_cap;
lzma_ret rc = lzma_code(&(xzdec->stream), LZMA_FINISH);
*output_buf_len = output_buf_cap - xzdec->stream.avail_out;
return rc;
}

// Explicit `_initialize` entry point that runs the module's constructors by
// calling `__wasm_call_ctors()`.
//
// Defining it here prevents Clang from wrapping every exported function and
// injecting calls to `__wasm_call_dtors()`.
//
// NOTE(review): presumably the host is expected to call `_initialize` once
// after instantiating the module and before using any other export --
// confirm against the JavaScript wrapper.
void _initialize() {
// Forward declaration only; the symbol is not defined in this file
// (presumably supplied by the toolchain at link time).
void __wasm_call_ctors();
__wasm_call_ctors();
}
Loading