From c43bf63846aedf3493ac6e8f4bc9f2bb48401d66 Mon Sep 17 00:00:00 2001 From: Brian Warner Date: Fri, 12 Apr 2024 00:36:26 -0500 Subject: [PATCH] feat(swing-store): budget-limited deletion of snapshot and transcripts Both `snapStore.deleteVatSnapshots()` and `transcriptStore.deleteVatTranscripts()` now take a numeric `budget=` argument, which will limit the number of snapshots or transcript spans deleted in each call. Both return a `{ done, cleanups }` record so the caller knows when to stop calling. This enables the slow deletion of large vats (lots of transcript spans or snapshots), a small number of items at a time. Recommended budget is 5, which (given SwingSet's `snapInterval=200` default) will cause the deletion of 1000 rows from the `transcriptItems` table each call, which shouldn't take more than 100ms. Without this, the kernel's attempt to slowly delete a terminated vat would succeed in slowly draining the kvStore, but would trigger a gigantic SQL transaction at the end, as it deleted every transcript item in the vat's history. The worst-case example I found would be the mainnet chain's v43-walletFactory, which (as of apr-2024) has 8.2M transcript items in 40k spans. A fast machine takes two seconds just to count all the items, and deletion took 22 *minutes*, with a `swingstore.wal` file that peaked at 27 GiB. This would cause an enormous chain stall at some surprising point in time weeks or months after the vat was first terminated. In addition, both the transcript spans and the snapshot records are shadowed into IAVL (via `export-data`) for integrity, and deleting 40k+40k=80k IAVL records in a single block might cause some significant churn too. The kernel should call `transcriptStore.stopUsingTranscript()` and `snapStore.stopUsingLastSnapshot()` as soon as the vat is terminated, to make exports smaller right away (by omitting all transcript/snapshot artifacts for the given vat, even before those DB rows or their export-data records have been deleted). 
New swing-store documentation was added. refs #8928 Co-authored-by: Richard Gibson --- packages/SwingSet/docs/configuration.md | 2 +- packages/swing-store/docs/bundlestore.md | 30 + packages/swing-store/docs/kvstore.md | 35 ++ packages/swing-store/docs/snapstore.md | 49 ++ packages/swing-store/docs/swingstore.md | 68 ++- packages/swing-store/docs/transcriptstore.md | 70 +++ packages/swing-store/src/snapStore.js | 66 ++- packages/swing-store/src/swingStore.js | 1 + packages/swing-store/src/transcriptStore.js | 135 ++++- packages/swing-store/test/deletion.test.js | 583 ++++++++++++++++++- 10 files changed, 1006 insertions(+), 33 deletions(-) create mode 100644 packages/swing-store/docs/bundlestore.md create mode 100644 packages/swing-store/docs/kvstore.md create mode 100644 packages/swing-store/docs/snapstore.md create mode 100644 packages/swing-store/docs/transcriptstore.md diff --git a/packages/SwingSet/docs/configuration.md b/packages/SwingSet/docs/configuration.md index 49b0d482551..9c7614ea1c6 100644 --- a/packages/SwingSet/docs/configuration.md +++ b/packages/SwingSet/docs/configuration.md @@ -150,7 +150,7 @@ The `snapshotInitial` property is a special snapshot interval that applies only to the vat's very first snapshot. We treat it as a special case because a few of the very first cranks of a vat (which involve initialization) can be quite expensive, and we'd like to be able to promptly capture the benefit having paid -that expense so that future replays don't need to repeat the work. Defaults to 2. +that expense so that future replays don't need to repeat the work. Defaults to 3. The code that realizes a vat or device can be specified in one of five ways: diff --git a/packages/swing-store/docs/bundlestore.md b/packages/swing-store/docs/bundlestore.md new file mode 100644 index 00000000000..661c196b9e3 --- /dev/null +++ b/packages/swing-store/docs/bundlestore.md @@ -0,0 +1,30 @@ +# BundleStore + +The `kernelStorage.bundleStore` sub-store manages code bundles. 
These can be used to hold vat-worker supervisor code (e.g. the [`@endo/lockdown`](https://github.com/endojs/endo/tree/master/packages/lockdown) bundle, or the [`@agoric/swingset-xsnap-supervisor` package](../../swingset-xsnap-supervisor), which incorporates liveslots), or the initial vat code bundles (for both kernel-defined bundles like vat-comms or vat-timer, or for application-defined bundles like vat-zoe or the ZCF code). It can also hold bundles that will be loaded later by userspace vat code, such as contract bundles. + +Each bundle held by the bundleStore is identified by a secure BundleID, which contains a format version integer and a hash, with a format like `b0-123abc456def...` or `b1-789ghi012...`. This contains enough information to securely define the behavior of the code inside the bundle, and to identify the tools needed to load/evaluate it. + +The bundleStore provides a simple add/get/remove API to the kernel. The kernel adds its own bundles during initialization, and provides the host application with an API to load additional ones in later. The kernel code that creates new vats will read bundles from the bundleStore when necessary, as vats are created. Userspace can get access to "BundleCap" objects that represent bundles, to keep the large bundle blobs out of RAM as much as possible. + +## Data Model + +Bundles are actually JavaScript objects: records of at least `{ moduleFormat }`, plus some format-specific fields like `endoZipBase64` and `endoZipBase64Sha512`. They are created by the [`@endo/bundle-source`](https://github.com/endojs/endo/tree/master/packages/bundle-source) package. Many are consumed by [`@endo/import-bundle`](https://github.com/endojs/endo/tree/master/packages/import-bundle), but the `b0-` format bundles can be loaded with some simple string manipulation and a call to `eval()` (which is how supervisor bundles are injected into new vat workers, before `@endo/import-bundle` is available). 
The bundleStore database treats each bundle as a BundleID and a blob of contents.
+ diff --git a/packages/swing-store/docs/kvstore.md b/packages/swing-store/docs/kvstore.md new file mode 100644 index 00000000000..a710b4eee70 --- /dev/null +++ b/packages/swing-store/docs/kvstore.md @@ -0,0 +1,35 @@ +# KVStore + +The `kernelStorage.kvStore` sub-store manages a table of arbitrary key-value (string-to-string) pairs. It provides the usual get/set/has/delete APIs, plus a `getNextKey` call to support lexicographic iteration. + +There are three separate sections of the namespace. The normal one is the "consensus" section. Each value written here will be given an export-data row, and incorporated into the "crankhash" (described below). + +The second is "local", and includes any key which is prefixed with `local.`. These keys are *not* given export-data rows, nor are they included in the crankhash. + +The third is "host", and includes any key which is prefixed with `host.`. This is not available to `kernelStorage.kvStore` at all: it is only accessed by methods on `hostStorage.kvStore` (the `kernelStorage` methods will throw an error if given a key like `host.foo`, and the `hostStorage` methods will throw *unless* given a key like `host.foo`). These are also excluded from export-data and the crankhash. Host keys are reserved for the host application, and are generally used to keep track of things like which block has been executed, to manage consistency between a separate host database (eg IAVL) and the swingstore. The host can record "I told the kernel to execute the contents of block 56" into `hostStorage.kvStore`, and then do `hostStorage.commit()`, and then it can record "I processed the rest of block 56" into is own DB, and then commit its own DB. If, upon startup, it observes a discrepancy between the `hostStorage.kvStore` record and its own DB, it knows it got interrupted between these two commit points, which can trigger recovery code. + +Any key which doesn't start with `local.` or `host.` is part of the "consensus" section. 
+ +## CrankHash and ActivityHash + +Swingset kernels are frequently run in a consensus mode, where multiple instances of the kernel (on different machines) are expected to execute the same deliveries in lock-step. In this mode, every kernel is expected to do exactly the same computation, and any divergence indicates a failure (or attempt at malice). We want to detect such variations quickly, so the diverging/failing member can "fall out of consensus" promptly. + +The swingstore hashes all changes to the "consensus" portion of the kvStore into the "crank hash". This hash covers every change since the beginning of the current crank, and the kernel logs the result at the end of each crank, at which point the crankhash is reset. + +Each crank also updates a value called the "activity hash", by hashing the previous activityhash and the latest crankhash together. This records a chain of changes, and is logged at the end of each crank too. + +The host application can record the activityhash into its own consensus-tracking database (eg IAVL) at the end of each kernel run, to ensure that any internal divergence of swingset behavior is escalated to a proper consensus failure. Without this, one instance of the kernel might "think differently" than the others, but still "act" the same (in terms of IO or externally-visible messages) without triggering a failure, which would be a lurking problem. + +Logging both the crankhash and the activityhash improves our ability to diagnose consensus failures. By comparing logs between a "good" machine and a "bad" (diverging) one, we can quickly determine which crank caused the problem, and usually compare slogfile delivery/syscall records to narrow the divergence down to a specific syscall. + +kvStore changes are also recorded by the export-data, but these are too voluminous to be logged, and do not capture multiple changes to the same key. And not all host applications use exports, so there might not be anything watching export data. 
+ +## Data Model + +The kvStore holds a simple string-to-string key/value store. The SQLite schema for the `kvStore` table is simply `(key TEXT, value TEXT)`. + +## Export Model + +To ensure that every key/value pair is correctly validatable, *all* in-consensus kvStore rows get their own export-data item. The name is just `kv.${key}`, and the value is just the value. `kvStore.delete(key)` will delete the export-data item. There are no artifacts. + +These make up the vast majority of the export-data items, both by count and by "churn" (the number of export-data items changed in a single crank). In the future, we would prefer to keep the kvStore in some sort of Merkle-tree data structure, and emit only a handful of export-data rows that contain hashes (perhaps just a single root hash). In this approach, the actual data would be exported in one or more artifacts. However, our SQLite backend does not provide the same kind of automatic Merkleization as IAVL, and only holds a single version of data at a time, making this impractical. diff --git a/packages/swing-store/docs/snapstore.md b/packages/swing-store/docs/snapstore.md new file mode 100644 index 00000000000..de4bff1f2fb --- /dev/null +++ b/packages/swing-store/docs/snapstore.md @@ -0,0 +1,49 @@ +# SnapStore + +The `kernelStorage.snapStore` sub-store tracks vat heap snapshots. These blobs capture the state of an XS JavaScript engine, between deliveries, to speed up replay-based persistence. The kernel can start a vat worker from a recent heap snapshot, and then it only needs to replay a handful of transcript items (deliveries), instead of replaying every delivery since the beginning of the incarnation. + +The XS / [`xsnap`](../../xsnap) engine defines the heap snapshot format. It consists of a large table of "slots", which are linked together to form JavaScript objects, strings, Maps, functions, etc. 
The snapshot also includes "chunks" for large data fields (like strings and BigInts), a stack, and some other supporting tables. The snapStore doesn't care about any of the internal details: it just gets a big blob of bytes. + +## Data Model + +Each snapshot is compressed and stored in the SQLite row as a BLOB. The snapStore has a single table named `snapshots`, with a schema of `(vatID TEXT, snapPos INTEGER, inUse INTEGER, hash TEXT, uncompressedSize INTEGER, compressedSize INTEGER, compressedSnapshot BLOB)`. + +The kernel has a scheduler which decides when to take a heap snapshot for each vat. There is a tradeoff between the immediate cost of creating the snapshot, versus the expected future savings of having a shorter transcript to replay. More frequent snapshots save time later, at the cost of time spent now. + +The kernel currently uses a [very simple scheduler](../../SwingSet/src/kernel/vat-warehouse.js), which takes a snapshot every `snapshotInterval` deliveries (e.g. 200), plus an extra one a few deliveries (`snapshotInitial`) into the new incarnation, to avoid replaying expensive contract startup code. The [SwingSet configuration documentation](../../SwingSet/docs/configuration.md) has the details. + +However, the swingstore is unaware of the kernel's scheduling policy. Every once in a while, the kernel tells the snapStore about a new snapshot, and the snapStore updates its data. + +Like the [transcriptStore](./transcriptstore.md), the snapStore retains a hash of older records, even after it prunes the snapshot data itself. There is at most one `inUse = 1` record for each vatID, and it will always have the highest `snapPos` value. When a particular vatID's active snapshot is replaced, the SQLite table row is updated to clear the `inUse` flag (i.e. set it to NULL). 
By default, the `compressedSnapshot` field is also set to NULL, removing the large data blob, but there is an option (`keepSnapshots: true`) to retain the full contents of all snapshots, even the ones that are no longer in use. + +## Export Model + +Each snapshot, both current and historic, gets an export-data entry. The name is `snapshot.${vatID}.${position}`, where `position` is the latest delivery (eg highest delivery number) that was included in the heap state captured by the snapshot. The value is a JSON-serialized record of `{ vatID, snapPos, hash, inUse }`. + +If there is a "current" snapshot, there will be one additional export-data record, whose name is `snapshot.${vatID}.current`, and whose value is `snapshot.${vatID}.${position}`. This value is the same as the name of the latest export-data record, and is meant as a convenient pointer to find that latest snapshot. + +The export *artifacts* will generally only include the current snapshot for each vat. Only the `debug` mode will include historical snapshots (and only if the swingstore was retaining them in the first place). + +## Slow Deletion + +As soon as a vat is terminated, the kernel will call `snapStore.stopUsingLastSnapshot()`. The DB is updated to clear the `inUse` flag of the latest snapshot, leaving no rows with `inUse = 1`. This immediately makes the vat non-loadable by the kernel. The snapshot data itself is deleted (unless `keepSnapshots: true`). + +This also modifies the latest `snapshot.${vatID}.${snapPos}` export-data record, to change `inUse` to 0, and removes the `snapshot.${vatID}.current` record. The modification and deletion are added to the export-data callback queue, so the host-app can learn about them after the next commit. Any subsequent `getExportData()` calls will observe the changes. + +As a result, all non-`debug` swing-store exports after this point will omit any artifacts for the snapshot blob, but they will still include export-data records (hashes) for all snapshots. 
(Deleting all the export-data records is too much work to do in a single step, so it is spread out over time). + +Later, as the kernel performs cleanup work for this vatID, the cleanup call will delete DB rows (one per `budget`). Each row deleted will also remove one export-data record, which feeds the callback queue, as well as affecting the full `getExportData()` results. + +Eventually, the snapStore runs out of rows to delete, and `deleteVatSnapshots(budget)` returns `{ done: true }`, so the kernel can finally rest. + +### SnapStore Vat Lifetime + +The SnapStore doesn't provide an explicit API to call when a vat is first created. The kernel just calls `saveSnapshot()` for both the first and all subsequent snapshots. Each `saveSnapshot()` marks the previous snapshot as unused, so there is at most one `inUse = 1` snapshot at any time. There will be zero in-use snapshots just after each incarnation starts, until enough deliveries have been made to trigger the first snapshot. + +When terminating a vat, the kernel should first call `snapStore.stopUsingLastSnapshot(vatID)`, the same call it would make at the end of an incarnation, to indicate that we're no longer using the last snapshot. This results in zero in-use snapshots. + +Then, the kernel must either call `snapStore.deleteVatSnapshots(vatID, undefined)` to delete everything at once, or make a series of calls (spread out over time/blocks) to `snapStore.deleteVatSnapshots(vatID, budget)`. Each will return `{ done, cleanups }`, which can be used to manage the rate-limiting and know when the process is finished. + +The `stopUsingLastSnapshot()` is a performance improvement, but is not mandatory. If omitted, exports will continue to include the vat's snapshot artifacts until the first call to `deleteVatSnapshots()`, after which they will go away. Snapshots are deleted in descending `snapPos` order, so the first call will delete the only `inUse = 1` snapshot, after which exports will omit all artifacts for the vatID. 
`stopUsingLastSnapshot()` is idempotent, and extra calls will leave the DB unchanged. + +The kernel must keep calling `deleteVatSnapshots(vatID, budget)` until the `{ done }` return value is `true`. It is safe to call it again after that point; the function will keep returning `true`. But note, this costs one DB txn, so it may be cheaper for the kernel to somehow remember that we've reached the end. diff --git a/packages/swing-store/docs/swingstore.md b/packages/swing-store/docs/swingstore.md index 56bd174dd28..b6cde0a45c1 100644 --- a/packages/swing-store/docs/swingstore.md +++ b/packages/swing-store/docs/swingstore.md @@ -1,13 +1,57 @@ -# SwingStore Data Model +# The SwingStore The "SwingStore" provides a database to hold SwingSet kernel state, with an API crafted to help both the kernel and the host application mutate, commit, export, and import this state. -The state is broken up into several pieces, or "stores": +The entire durable state of the kernel lives in the SwingStore: it does not use any other files or databases, and the only commit point is in `hostStorage.commit()`. Careful host applications can use this to avoid "hangover inconsistency", by storing all device output messages in the same database, and only releasing them once the kernel changes have been committed. + +In theory, an alternate implementation of this API could be provided with e.g. a different backend database, such as the host application's own native database (eg IAVL, for cosmos-sdk -based apps). This could simplify the atomicity domains by using just one database instead of two. This must be balanced against performance tradeoffs: swing-store takes advantage of SQL's indexing and iteration abilities, which might not be present in the other database. + +## Creating and Opening a SwingStore + + +`initSwingStore(dirPath, options)` will create a new swingstore in the given directory, which will be created if it doesn't already exist. 
The entire directory is reserved for the swingstore: the host application should not put any other files there. The swingstore library will populated it with the SQLite DB's backing files: `swingstore.sqlite`, `swingstore.sqlite-wal`, and `swingstore.sqlite-shm`. If called on a directory that already contains a database, the DB will be erased first. + +`openSwingStore(dirPath, options)` does the same, but will not erase a pre-existing DB. In general, use `initSwingStore` for the initial creation of the DB, and `openSwingStore` for all subsequent access. + +Both calls return a record with `{ hostStorage, kernelStorage }`, along with some additional facets for testing and debugging. `dirPath` can be null to use a ephemeral (in-memory) DB, which is only useful for unit tests. + +## HostStorage + +The `hostStorage` facet is reserved for the host application. It is mostly used to manage commit points for the application. + +The host is responsible for calling `hostStorage.commit()` when it is done with kernel execution. This causes a SQLite `COMMIT` of the underlying database. It should perform this commit before it releases any device output messages. This facet is the only one with a `commit()` method: the kernel is explicitly unable to commit its own changes to the underlying SQLite database, because the kernel does not know anything about the host's application lifecycle or input/output activity, so it cannot know what qualifies as a safe commit point. + +If, for some reason, the host wants to abandon execution, it can call `hostStorage.close()`, which will close the swingstore without committing any changes. This is not normally useful: the kernel must be abandoned at this point too, so most of the time the host application should just exit entirely. + +`hostStorage.kvStore` is also available to let the host add items to a separate portion of the kvStore, using keys which start with a `host.` prefix. 
It can use this to coordinate with a separately-committed host database (e.g. to remember how much work has been given to the kernel, and how much has been successfully executed). This portion of the kvStore is unreachable by the kernel. + +`hostStorage.setExportCallback()` is used to register an export callback after swingstore creation, see [data-export.md](./data-export.md) for details. Most applications will instead provide `options.exportCallback` to `openSwingStore()`. + +`hostStorage.repairMetadata()` was used to repair a historical flaw in the database format, and is not needed by new installations. + +## KernelStorage + +The host application is supposed to deliver the `kernelStorage` facet to the kernel, by passing it into `initializeSwingset()`, `upgradeSwingset()`, and `buildVatController()`. The host application should not use `kernelStorage` itself. + +The kernel receives a facet named `kernelStorage`, from which it can access four sub-stores: + +* [`bundleStore`](./bundlestore.md): a string-keyed Bundle-value table, holding source bundles which can be evaluated by `importBundle` to create vats, or new Compartments within a vat +* [`transcriptStore`](./transcriptstore.md): records a linear sequence of deliveries and syscalls (with results), collectively known as "transcript entries", for each vat +* [`snapStore`](./snapstore.md): records one or more XS heap snapshots for each vat, to rebuild a worker more efficiently than replaying all transcript entries from the beginning +* [`kvStore`](./kvstore.md): a string-keyed string-valued table, which holds everything else. Currently, this holds each vat's c-list and vatstore data, as well as the kernel-wide object and promise tables, and run-queues. + +These pieces operate independently: data in one substore does not affect the operation of the others. + +`kernelStorage` also provides access to the "crank" tools. 
Kernel execution proceeds in a series of steps named "cranks", many of which involve delivering a message to a vat worker. Sometimes these messages cause a failure halfway through the delivery, where it is better to record either complete deliveries or nothing at all. To support this, the kernel can mark the beginning of the crank (by calling `kernelStorage.startCrank()`), and then either discard the changes (`rollbackCrank()`) or accept them (`endCrank()`). The `emitCrankHashes()` method rotates the crankhash and updates the activityhash (see the kvStore documentation for details). + +Note that `endCrank()` does *not* perform a SQLite `COMMIT`, as that power is reserved for the host application (through `hostStorage.commit()`). Instead, the kernel only has access to SQLite "savepoints", which are smaller-scale than full transactions. + + +# SwingStore Data Model + + +The state is broken up into several pieces, or "sub-stores": -* `bundleStore`: a string-keyed Bundle-value table, holding source bundles which can be evaluated by `importBundle` to create vats, or new Compartments within a vat -* `transcriptStore`: records a linear sequence of deliveries and syscalls (with results), collectively known as "transcript entries", for each vat -* `snapStore`: records one or more XS heap snapshots for each vat, to rebuild a worker more efficiently than replaying all transcript entries from the beginning -* `kvStore`: a string-keyed string-valued table, which holds everything else. Currently, this holds each vat's c-list and vatstore data, as well as the kernel-wide object and promise tables, and run-queues. ## Incarnations, Spans, Snapshots @@ -50,3 +94,15 @@ When a transcript span is pruned, the `transcriptSpans` row is left alone, but t During import, we create the metadata first (as the export-data is parsed), then later, we fill in the details as the artifacts are read. 
Bundles are never pruned, however during import, the `bundles` table will temporarily contain rows whose `bundle` BLOB is NULL. + +## Vat Lifetimes + +Two sub-stores are keyed by VatID: `transcriptStore` and `snapStore` (the `bundleStore` does not know which vats might know about each bundle, and the `kvStore` entries which relate to a specific vat will have the VatID embedded in the key, so the swing-store doesn't need to know about them). + +When the kernel terminates a vat, we want to delete the no-longer-necessary data. However, if the vat had a large number of transcript entries and/or heap snapshots, deleting all this data at the same time might cause excessing CPU or I/O usage (eg thousands of DB queries, or a multi-gigabyte `swingstore.sqlite-wal` file. It might also push a large number of changes into the export-data callbacks, which can cause memory or CPU stall problems in the host application. In the worst case, the entire application could crash. + +To limit this usage, and allow the kernel to delete vat state slowly, the swing-store is somewhat more aware of a vat's lifetime than a mere database should be. In particular, we split the shutdown process into two pieces. "Terminating a vat" happens first, and tells the sub-store to hide the vat from exports and from API calls that are meant to find out which vats are available. The kernel should call this exactly once, when the vat is terminated. + +The second part is "deletion", and it can happen either all-at-once or in multiple budget-limited calls. Both forms share the same API calls, differing only in their `budget` argument (`undefined` means all-at-once). The deletion API can be called multiple times, with a small budget, and each call will only delete a small portion of the state. They will return a value that indicates when the last bit of state has been deleted, so the kernel can know when to stop calling them. 
+ +See [transcriptstore.md](./transcriptstore.md) and [snapstore.md](./snapstore.md) for more details. diff --git a/packages/swing-store/docs/transcriptstore.md b/packages/swing-store/docs/transcriptstore.md new file mode 100644 index 00000000000..a3540e02f6f --- /dev/null +++ b/packages/swing-store/docs/transcriptstore.md @@ -0,0 +1,70 @@ +# TranscriptStore + +The `kernelStorage.transcriptStore` sub-store tracks vat delivery transcripts, through which the kernel can provide orthogonal persistence of JavaScript runtime environments (vats). + +Each vat is a JavaScript runtime environment, initialized by evaluating some starting code bundle, and then fed a series of deliveries. Each delivery may provoke some number of syscalls back to the kernel, with each get some response data. The delivery finishes with a "delivery result". + +For each delivery, this data (delivery, syscall/response pairs, delivery-result) is serialized and stored in a single "transcript item". Each item is indexed by an incrementing "delivery number" (`deliveryNum`). + +When a vat worker is brought online, the kernel retrieves these transcript items from the transcriptStore and replays them, by performing the delivery and responding to the syscalls, even though the syscall responses are pulled from the transcript instead of causing actual execution. The kernel asserts that the new worker behaves exactly like the original one did. For xsnap workers, the kernel doesn't actually have to replay the *entire* transcript, because it can start from a heap snapshot (stored in the adjoining [`snapStore`](./snapstore.md)). So generally it only needs to replay a single span. + +## Data Model + +Vat lifetimes are broken up into "incarnations", separated by upgrade events. Within each incarnation, the transcript is broken up into "spans", separated by heap-snapshot cycles. To end a span, the kernel records the worker's heap snapshot, then "closes" the old span, and opens a new one. 
+ +This results in a single open or "current" span for each active vat, and a series of historical spans. For operational purposes, we only care about the current span. But to support some potential deep-replay needs, the transcriptStore can retain data about earlier spans. + +The SQLite database has one table that tracks transcript spans, named `transcriptSpans`. All vatIDs and incarnations are stored in the same table, whose schema is `(vatID TEXT, startPos INTEGER, endPos INTEGER, hash TEXT, isCurrent INTEGER, incarnation INTEGER)`. `startPos` and `endPos` define a zero-based range over the sequence of all deliveries into a vat (the former inclusive and the latter exclusive, such that e.g. `startPos=0` and `endPos=3` would encompass the first three deliveries, with positions 0, 1, and 2). + +A separate table named `transcriptItems` tracks the items themselves, with a schema of `(vatID TEXT, position INTEGER, item TEXT, incarnation INTEGER)`. This table has one row per transcript item, each of which is "owned" by a single span with matching values for `vatID` and `incarnation` and having `startPos <= position` and `endPos > position`. Each span owns multiple items (typically 200, but it depends upon how frequently the kernel rolls over new spans). + +In the future, historical spans may be compressed, and their item rows replaced with a single compressed blob in the span record. This would reduce the space needed without actually pruning the data. + +## Retention / Pruning + +If the current swingstore was opened with the `keepTranscripts = false` option, then the transcriptStore will "prune" each span as soon as it becomes historical. Pruned spans will still have a span record, with a hash, to enable safely-validated restoration of the transcript items later, if necessary. However their item records will be deleted, to save space. + +When `keepTranscripts = true`, all span items are retained. 
+ +Pruned spans are not available for export artifacts, of course, because the data is missing. However the span *hashes* are still included in the export-data, to support safe validation. You can start with a pruned swingstore, produce an export dataset, import that dataset into a new swingstore, and the new swingstore will be just as capable of validating replacement span records as the original was. + +## Export Model + +Every transcript span, both current and historic, gets an export-data record. The record name is different for the two types of spans. + +Historical spans, which are "closed" and no longer growing, use a record name of +`transcript.${vatID}.${startPos}`, where `startPos` is the delivery number of the first delivery included in the span. The value is a JSON-serialized record of `{ vatID, startPos, endPos, hash, isCurrent, incarnation }` (where `isCurrent = 0`). + +The current span, if any, uses a record name of `transcript.${vatID}.current`, and has the same value as historical spans (except `isCurrent = 1`). Current spans are growing: new transcript items are added as more deliveries are made, until the span is closed off (becomes historical) and replaced with a new current span. There is at most one current span per vatID. + +The available export *artifacts* will depend upon the export mode, and upon the swingstore's `keepTranscripts` setting. Each export artifact corresponds to a single span, and the artifact names are always `transcript.${vatID}.${startPos}.${endPos}` (for both historical and current spans). + +In the most-minimal `operational` mode, the export includes one artifact for each active (non-terminated) vat: just the current span. If `keepTranscripts` is true, these will be the only available artifacts anyways. + +The `replay` mode includes all spans for each vat's current incarnation, but omits spans from earlier incarnations. The `archival` mode includes all spans from all incarnations. 
+
+The `debug` mode includes all available spans, even for terminated vats. For the non-`debug` modes, terminated vats will not provide export-data or artifacts. + +## Slow Deletion + +As soon as a vat is terminated, the kernel will call `transcriptStore.stopUsingTranscript()`. The DB is updated to clear the `isCurrent` flag of the latest span, leaving no rows with `isCurrent = 1`. This immediately makes the vat non-loadable by the kernel. + +This also removes the `transcript.${vatID}.current` export-data record, and replaces it with a `transcript.${vatID}.${startPos}` one, effectively making the span historical. This change (one deletion, one addition) is added to the export-data callback queue, so the host-app can learn about it after the next commit, and any subsequent `getExportData()` calls will see the replacement record, instead of a `.current` record. + +All non-`debug` swing-store exports after this point will omit any artifacts for the vat, but they will still include export-data records (hashes) for all spans, all of which look historical. (Deleting all the span records, and their corresponding export-data records, is too much work to do in a single step). + +Later, as the kernel performs cleanup work for this vatID, the `transcriptStore.deleteVatTranscripts(budget)` cleanup call will delete one span row per `budget`, along with all related item rows (typically 200). Each span deleted will also remove one export-data record (which feeds the callback queue, as well as affecting the full `getExportData()` results). + +Eventually, the transcriptStore runs out of rows to delete, and `deleteVatTranscripts(budget)` returns `{ done: true }`, so the kernel can finally rest. + +### TranscriptStore Vat Lifetime + +Unlike the [SnapStore](./snapstore.md), the TranscriptStore *does* have an explicit call to be made when a vat is first created: `transcriptStore.initTranscript(vatID)`. 
Also unlike SnapStore, TranscriptStore (normally) always has an `isCurrent = 1` span for each vat (it might just be empty of items, immediately after the span rolls over). + +When a vat is terminated, the kernel should first call `transcriptStore.stopUsingTranscript(vatID)`. This will mark the single current span as `isCurrent = 0`. The kernel must not attempt to read, add, or rollover spans or items while in this state. While in this state, exports (except for `mode = debug`) will not emit artifacts for this VatID: export-data records will still exist for all spans, as these must be deleted slowly, however there will be no associated artifacts or artifact names. + +Then, the kernel should either call `transcriptStore.deleteVatTranscripts(vatID, undefined)` exactly once, or it should call `transcriptStore.deleteVatTranscripts(vatID, budget)` until it returns `{ done: true }`. + +As with snapshots, the `stopUsingTranscript()` is a non-mandatory performance improvement. If omitted, exports will continue to include (many) span artifacts for this vat until the first call to `deleteVatTranscripts()` removes the one `isCurrent = 1` span (since spans are deleted most-recent-first). After that point, exports will stop including any artifacts for the vatID. `stopUsingTranscript()` is idempotent, and extra calls will leave the DB unchanged. + +The kernel must keep calling `deleteVatTranscripts(vatID, budget)` until the `{ done }` return value is `true`. As with the SnapStore, it is safe to call it again after that point; the function will keep returning `true`. 
diff --git a/packages/swing-store/src/snapStore.js b/packages/swing-store/src/snapStore.js index c96a49c0343..20e13373f9f 100644 --- a/packages/swing-store/src/snapStore.js +++ b/packages/swing-store/src/snapStore.js @@ -39,7 +39,7 @@ import { buffer } from './util.js'; * loadSnapshot: (vatID: string) => AsyncIterableIterator, * saveSnapshot: (vatID: string, snapPos: number, snapshotStream: AsyncIterable) => Promise, * deleteAllUnusedSnapshots: () => void, - * deleteVatSnapshots: (vatID: string) => void, + * deleteVatSnapshots: (vatID: string, budget?: number) => { done: boolean, cleanups: number }, * stopUsingLastSnapshot: (vatID: string) => void, * getSnapshotInfo: (vatID: string) => SnapshotInfo, * }} SnapStore @@ -173,11 +173,13 @@ export function makeSnapStore( `); function stopUsingLastSnapshot(vatID) { + // idempotent ensureTxn(); const oldInfo = sqlGetPriorSnapshotInfo.get(vatID); if (oldInfo) { const rec = snapshotRec(vatID, oldInfo.snapPos, oldInfo.hash, 0); noteExport(snapshotMetadataKey(rec), JSON.stringify(rec)); + noteExport(currentSnapshotMetadataKey(rec), undefined); if (keepSnapshots) { sqlStopUsingLastSnapshot.run(vatID); } else { @@ -354,28 +356,74 @@ export function makeSnapStore( WHERE vatID = ? `); + const sqlDeleteOneVatSnapshot = db.prepare(` + DELETE FROM snapshots + WHERE vatID = ? AND snapPos = ? + `); + const sqlGetSnapshotList = db.prepare(` SELECT snapPos FROM snapshots WHERE vatID = ? ORDER BY snapPos `); - sqlGetSnapshotList.pluck(true); + + const sqlGetSnapshotListLimited = db.prepare(` + SELECT snapPos, inUse + FROM snapshots + WHERE vatID = ? + ORDER BY snapPos DESC + LIMIT ? 
+ `); /** - * Delete all snapshots for a given vat (for use when, e.g., a vat is terminated) + * @param {string} vatID + * @returns {boolean} + */ + function hasSnapshots(vatID) { + // the LIMIT 1 means we aren't really getting all entries + return sqlGetSnapshotListLimited.all(vatID, 1).length > 0; + } + + /** + * Delete some or all snapshots for a given vat (for use when, e.g., + * a vat is terminated) * * @param {string} vatID + * @param {number} [budget] + * @returns {{ done: boolean, cleanups: number }} */ - function deleteVatSnapshots(vatID) { + function deleteVatSnapshots(vatID, budget = undefined) { ensureTxn(); - const deletions = sqlGetSnapshotList.all(vatID); - for (const snapPos of deletions) { + const deleteAll = budget === undefined; + assert(deleteAll || budget >= 1, 'budget must be undefined or positive'); + // We can't use .iterate because noteExport can write to the DB, + // and overlapping queries are not supported. + const deletions = deleteAll + ? sqlGetSnapshotList.all(vatID) + : sqlGetSnapshotListLimited.all(vatID, budget); + let clearCurrent = deleteAll; + for (const deletion of deletions) { + clearCurrent ||= deletion.inUse; + const { snapPos } = deletion; const exportRec = snapshotRec(vatID, snapPos, undefined); noteExport(snapshotMetadataKey(exportRec), undefined); + // Budgeted deletion must delete rows one by one, + // but full deletion is handled all at once after this loop. 
+ if (!deleteAll) { + sqlDeleteOneVatSnapshot.run(vatID, snapPos); + } + } + if (deleteAll) { + sqlDeleteVatSnapshots.run(vatID); + } + if (clearCurrent) { + noteExport(currentSnapshotMetadataKey({ vatID }), undefined); } - noteExport(currentSnapshotMetadataKey({ vatID }), undefined); - sqlDeleteVatSnapshots.run(vatID); + return { + done: deleteAll || deletions.length === 0 || !hasSnapshots(vatID), + cleanups: deletions.length, + }; } const sqlGetSnapshotInfo = db.prepare(` @@ -452,7 +500,7 @@ export function makeSnapStore( `); /** - * Obtain artifact metadata records for spanshots contained in this store. + * Obtain artifact metadata records for snapshots contained in this store. * * @param {boolean} includeHistorical If true, include all metadata that is * present in the store regardless of its currency; if false, only include diff --git a/packages/swing-store/src/swingStore.js b/packages/swing-store/src/swingStore.js index ccf9c200687..40c72b15644 100644 --- a/packages/swing-store/src/swingStore.js +++ b/packages/swing-store/src/swingStore.js @@ -554,6 +554,7 @@ export function makeSwingStore(dirPath, forceReset, options = {}) { getCurrentSpanBounds: transcriptStore.getCurrentSpanBounds, addItem: transcriptStore.addItem, readSpan: transcriptStore.readSpan, + stopUsingTranscript: transcriptStore.stopUsingTranscript, deleteVatTranscripts: transcriptStore.deleteVatTranscripts, }; diff --git a/packages/swing-store/src/transcriptStore.js b/packages/swing-store/src/transcriptStore.js index adbcb71e5d1..1a10e897d1d 100644 --- a/packages/swing-store/src/transcriptStore.js +++ b/packages/swing-store/src/transcriptStore.js @@ -18,7 +18,8 @@ import { createSHA256 } from './hasher.js'; * rolloverSpan: (vatID: string) => number, * rolloverIncarnation: (vatID: string) => number, * getCurrentSpanBounds: (vatID: string) => { startPos: number, endPos: number, hash: string, incarnation: number }, - * deleteVatTranscripts: (vatID: string) => void, + * stopUsingTranscript: (vatID: 
string) => void, + * deleteVatTranscripts: (vatID: string, budget?: number) => { done: boolean, cleanups: number }, * addItem: (vatID: string, item: string) => void, * readSpan: (vatID: string, startPos?: number) => IterableIterator, * }} TranscriptStore @@ -214,7 +215,7 @@ export function makeTranscriptStore( ensureTxn(); const initialIncarnation = 0; sqlWriteSpan.run(vatID, 0, 0, initialHash, 1, initialIncarnation); - const newRec = spanRec(vatID, 0, 0, initialHash, 1, 0); + const newRec = spanRec(vatID, 0, 0, initialHash, true, 0); noteExport(spanMetadataKey(newRec), JSON.stringify(newRec)); } @@ -251,21 +252,35 @@ export function makeTranscriptStore( function doSpanRollover(vatID, isNewIncarnation) { ensureTxn(); const { hash, startPos, endPos, incarnation } = getCurrentSpanBounds(vatID); - const rec = spanRec(vatID, startPos, endPos, hash, 0, incarnation); + const rec = spanRec(vatID, startPos, endPos, hash, false, incarnation); + + // add a new record for the now-old span noteExport(spanMetadataKey(rec), JSON.stringify(rec)); + + // and change its DB row to isCurrent=0 sqlEndCurrentSpan.run(vatID); + + // create a new (empty) row, with isCurrent=1 const incarnationToUse = isNewIncarnation ? 
incarnation + 1 : incarnation; sqlWriteSpan.run(vatID, endPos, endPos, initialHash, 1, incarnationToUse); + + // overwrite the transcript.${vatID}.current record with new span const newRec = spanRec( vatID, endPos, endPos, initialHash, - 1, + true, incarnationToUse, ); noteExport(spanMetadataKey(newRec), JSON.stringify(newRec)); + if (!keepTranscripts) { + // TODO: for #9174 (delete historical transcript spans), we need + // this DB statement to only delete the items of the old span + // (startPos..endPos), not all previous items, otherwise the + // first rollover after switching to keepTranscripts=false will + // do a huge DB commit and probably explode sqlDeleteOldItems.run(vatID, endPos); } return incarnationToUse; @@ -314,19 +329,101 @@ export function makeTranscriptStore( ORDER BY startPos `); + // This query is ORDER BY startPos DESC, so deleteVatTranscripts + // will delete the newest spans first. If the kernel failed to call + // stopUsingTranscript, that will delete the isCurrent=1 span first, + // which lets us stop including span artifacts in exports sooner. + + const sqlGetSomeVatSpans = db.prepare(` + SELECT vatID, startPos, endPos, isCurrent + FROM transcriptSpans + WHERE vatID = ? + ORDER BY startPos DESC + LIMIT ? + `); + + const sqlDeleteVatSpan = db.prepare(` + DELETE FROM transcriptSpans + WHERE vatID = ? AND startPos = ? + `); + + const sqlDeleteSomeItems = db.prepare(` + DELETE FROM transcriptItems + WHERE vatID = ? AND position >= ? AND position < ? + `); + + /** + * Prepare for vat deletion by marking the isCurrent span as not + * current. Idempotent. + * + * @param {string} vatID The vat being terminated/deleted. 
+ */ + function stopUsingTranscript(vatID) { + ensureTxn(); + // this transforms the current span into a (short) historical one + // (basically doSpanRollover without adding replacement data) + const bounds = sqlGetCurrentSpanBounds.get(vatID); + if (bounds) { + // add a new record for the now-old span + const { startPos, endPos, hash, incarnation } = bounds; + const rec = spanRec(vatID, startPos, endPos, hash, false, incarnation); + noteExport(spanMetadataKey(rec), JSON.stringify(rec)); + + // and change its DB row to isCurrent=0 + sqlEndCurrentSpan.run(vatID); + + // remove the transcript.${vatID}.current record + noteExport(spanMetadataKey({ vatID, isCurrent: true }), undefined); + } + } + + /** + * @param {string} vatID + * @returns {boolean} + */ + function hasSpans(vatID) { + // the LIMIT 1 means we aren't really getting all spans + return sqlGetSomeVatSpans.all(vatID, 1).length > 0; + } + /** - * Delete all transcript data for a given vat (for use when, e.g., a vat is terminated) + * Delete some or all transcript data for a given vat (for use when, + * e.g., a vat is terminated) * * @param {string} vatID + * @param {number} [budget] + * @returns {{ done: boolean, cleanups: number }} */ - function deleteVatTranscripts(vatID) { + function deleteVatTranscripts(vatID, budget = undefined) { ensureTxn(); - const deletions = sqlGetVatSpans.all(vatID); + const deleteAll = budget === undefined; + assert(deleteAll || budget >= 1, 'budget must be undefined or positive'); + // We can't use .iterate because noteExport can write to the DB, + // and overlapping queries are not supported. + const deletions = deleteAll + ? sqlGetVatSpans.all(vatID) + : sqlGetSomeVatSpans.all(vatID, budget); for (const rec of deletions) { + // If rec.isCurrent is true, this will remove the + // transcript.$vatID.current export-data record. If false, it + // will remove the transcript.$vatID.$startPos record. 
noteExport(spanMetadataKey(rec), undefined); + + // Budgeted deletion must delete rows one by one, + // but full deletion is handled all at once after this loop. + if (!deleteAll) { + sqlDeleteVatSpan.run(vatID, rec.startPos); + sqlDeleteSomeItems.run(vatID, rec.startPos, rec.endPos); + } + } + if (deleteAll) { + sqlDeleteVatItems.run(vatID); + sqlDeleteVatSpans.run(vatID); } - sqlDeleteVatItems.run(vatID); - sqlDeleteVatSpans.run(vatID); + return { + done: deleteAll || deletions.length === 0 || !hasSpans(vatID), + cleanups: deletions.length, + }; } const sqlGetAllSpanMetadata = db.prepare(` @@ -379,6 +476,12 @@ export function makeTranscriptStore( * The only code path which could use 'false' would be `swingstore.dump()`, * which takes the same flag. * + * Note that when a vat is terminated and has been partially + * deleted, we will retain (and return) a subset of the metadata + * records, because they must be deleted in-consensus and with + * updates to the noteExport hook. But we don't create any artifacts + * for the terminated vats, even for the spans that remain, + * * @yields {readonly [key: string, value: string]} * @returns {IterableIterator} * An iterator over pairs of [spanMetadataKey, rec], where `rec` is a @@ -432,9 +535,16 @@ export function makeTranscriptStore( } } } else if (artifactMode === 'archival') { - // everything + // every span for all vatIDs that have an isCurrent span (to + // ignore terminated/partially-deleted vats) + const vatIDs = new Set(); + for (const { vatID } of sqlGetCurrentSpanMetadata.iterate()) { + vatIDs.add(vatID); + } for (const rec of sqlGetAllSpanMetadata.iterate()) { - yield spanArtifactName(rec); + if (vatIDs.has(rec.vatID)) { + yield spanArtifactName(rec); + } } } else if (artifactMode === 'debug') { // everything that is a complete span @@ -572,7 +682,7 @@ export function makeTranscriptStore( const newEndPos = endPos + 1; const newHash = updateSpanHash(hash, item); sqlUpdateSpan.run(newEndPos, newHash, vatID); - 
const rec = spanRec(vatID, startPos, newEndPos, newHash, 1, incarnation); + const rec = spanRec(vatID, startPos, newEndPos, newHash, true, incarnation); noteExport(spanMetadataKey(rec), JSON.stringify(rec)); }; @@ -774,6 +884,7 @@ export function makeTranscriptStore( getCurrentSpanBounds, addItem, readSpan, + stopUsingTranscript, deleteVatTranscripts, exportSpan, diff --git a/packages/swing-store/test/deletion.test.js b/packages/swing-store/test/deletion.test.js index 58e08bf727b..e749fe92ba7 100644 --- a/packages/swing-store/test/deletion.test.js +++ b/packages/swing-store/test/deletion.test.js @@ -1,28 +1,50 @@ // @ts-check import test from 'ava'; +import path from 'path'; + import { Buffer } from 'node:buffer'; +import sqlite3 from 'better-sqlite3'; +import { tmpDir } from './util.js'; import { initSwingStore } from '../src/swingStore.js'; +import { makeSwingStoreExporter } from '../src/exporter.js'; +import { importSwingStore } from '../src/importer.js'; async function* getSnapshotStream() { yield Buffer.from('abc'); } harden(getSnapshotStream); +// update 'data' with the callback deltas to get a new current +// export-data record +const mergeExportDeltas = (data, exports) => { + for (const [key, value] of exports) { + if (value) { + data[key] = value; + } else { + delete data[key]; + } + } +}; + +const mapToObj = map => Object.fromEntries(map.entries()); + test('delete snapshots with export callback', async t => { const exportLog = []; + const exportData = {}; const exportCallback = exports => { for (const [key, value] of exports) { exportLog.push([key, value]); } + mergeExportDeltas(exportData, exports); }; const store = initSwingStore(null, { exportCallback }); const { kernelStorage, hostStorage } = store; const { snapStore } = kernelStorage; const { commit } = hostStorage; - - await snapStore.saveSnapshot('v1', 10, getSnapshotStream()); - await snapStore.saveSnapshot('v1', 11, getSnapshotStream()); - await snapStore.saveSnapshot('v1', 12, 
getSnapshotStream()); + const vatID = 'v1'; + await snapStore.saveSnapshot(vatID, 10, getSnapshotStream()); + await snapStore.saveSnapshot(vatID, 11, getSnapshotStream()); + await snapStore.saveSnapshot(vatID, 12, getSnapshotStream()); // nothing is written to exportCallback until endCrank() or commit() t.deepEqual(exportLog, []); @@ -33,11 +55,18 @@ test('delete snapshots with export callback', async t => { t.is(exportLog[1][0], 'snapshot.v1.11'); t.is(exportLog[2][0], 'snapshot.v1.12'); t.is(exportLog[3][0], 'snapshot.v1.current'); + const hash = JSON.parse(exportLog[0][1]).hash; + t.deepEqual(exportData, { + 'snapshot.v1.10': JSON.stringify({ vatID, snapPos: 10, hash, inUse: 0 }), + 'snapshot.v1.11': JSON.stringify({ vatID, snapPos: 11, hash, inUse: 0 }), + 'snapshot.v1.12': JSON.stringify({ vatID, snapPos: 12, hash, inUse: 1 }), + 'snapshot.v1.current': 'snapshot.v1.12', + }); exportLog.length = 0; // in a previous version, deleteVatSnapshots caused overlapping SQL // queries, and failed - snapStore.deleteVatSnapshots('v1'); + snapStore.deleteVatSnapshots(vatID); await commit(); t.deepEqual(exportLog, [ @@ -47,6 +76,7 @@ test('delete snapshots with export callback', async t => { ['snapshot.v1.current', null], ]); exportLog.length = 0; + t.deepEqual(exportData, {}); }); test('delete transcripts with export callback', async t => { @@ -91,3 +121,546 @@ test('delete transcripts with export callback', async t => { exportLog.length = 0; }); + +const getExport = async (dbDir, artifactMode) => { + const exporter = makeSwingStoreExporter(dbDir, { artifactMode }); + const exportData = new Map(); + for await (const [key, value] of exporter.getExportData()) { + exportData.set(key, value); + } + const artifactNames = []; + for await (const name of exporter.getArtifactNames()) { + artifactNames.push(name); + } + await exporter.close(); + return { exportData, artifactNames }; +}; + +const reImport = async (t, dbDir, artifactMode) => { + const [dbDir2, cleanup] = await 
tmpDir('testdb2'); + t.teardown(cleanup); + const exporter = makeSwingStoreExporter(dbDir, { artifactMode }); + const ss2 = await importSwingStore(exporter, dbDir2, { artifactMode }); + await ss2.hostStorage.commit(); + return sqlite3(path.join(dbDir2, 'swingstore.sqlite')); +}; + +const compareNoHash = (t, obj1, obj2) => { + const o1 = {}; + for (const [key, value] of Object.entries(obj1)) { + const { hash: _, ...data } = JSON.parse(value); + o1[key] = data; + } + return t.deepEqual(o1, obj2); +}; + +const setupTranscript = async t => { + const vatID = 'v1'; + const exportLog = []; + const currentExportData = {}; + const exportCallback = exports => { + for (const [key, value] of exports) { + exportLog.push([key, value]); + } + mergeExportDeltas(currentExportData, exports); + }; + const [dbDir, cleanup] = await tmpDir('testdb'); + t.teardown(cleanup); + const store = initSwingStore(dbDir, { exportCallback }); + const { kernelStorage, hostStorage } = store; + const { transcriptStore } = kernelStorage; + const { commit } = hostStorage; + // look directly at DB to confirm changes + const db = sqlite3(path.join(dbDir, 'swingstore.sqlite')); + + // two incarnations, two spans each + transcriptStore.initTranscript(vatID); + transcriptStore.addItem(vatID, 'aaa'); + transcriptStore.addItem(vatID, 'bbb'); + transcriptStore.rolloverSpan(vatID); + transcriptStore.addItem(vatID, 'ccc'); + transcriptStore.addItem(vatID, 'ddd'); + transcriptStore.rolloverIncarnation(vatID); + transcriptStore.addItem(vatID, 'eee'); + transcriptStore.addItem(vatID, 'fff'); + transcriptStore.rolloverSpan(vatID); + transcriptStore.addItem(vatID, 'ggg'); + transcriptStore.addItem(vatID, 'hhh'); + await commit(); + + return { + db, + dbDir, + commit, + transcriptStore, + exportLog, + currentExportData, + vatID, + }; +}; + +test('slow deletion of transcripts', async t => { + // slow transcript deletion should remove export-data as it removes + // transcript spans and their items + + const { + db, + 
dbDir, + commit, + transcriptStore, + exportLog, + currentExportData, + vatID, + } = await setupTranscript(t); + + t.is(exportLog.length, 4); + t.is(exportLog[0][0], 'transcript.v1.0'); + t.is(exportLog[1][0], 'transcript.v1.2'); + t.is(exportLog[2][0], 'transcript.v1.4'); + t.is(exportLog[3][0], 'transcript.v1.current'); + exportLog.length = 0; + const t0 = { vatID, startPos: 0, endPos: 2, isCurrent: 0, incarnation: 0 }; + const t2 = { vatID, startPos: 2, endPos: 4, isCurrent: 0, incarnation: 0 }; + const t4 = { vatID, startPos: 4, endPos: 6, isCurrent: 0, incarnation: 1 }; + const tc = { vatID, startPos: 6, endPos: 8, isCurrent: 1, incarnation: 1 }; + const t6 = { vatID, startPos: 6, endPos: 8, isCurrent: 0, incarnation: 1 }; + compareNoHash(t, currentExportData, { + 'transcript.v1.0': t0, + 'transcript.v1.2': t2, + 'transcript.v1.4': t4, + 'transcript.v1.current': tc, + }); + + t.is(db.prepare('SELECT COUNT(*) FROM transcriptItems').pluck().get(), 8); + t.is(db.prepare('SELECT COUNT(*) FROM transcriptSpans').pluck().get(), 4); + + // an "operational"-mode export should list all spans, but only have + // artifacts for the current one + { + const { exportData, artifactNames } = await getExport(dbDir, 'operational'); + t.deepEqual(currentExportData, mapToObj(exportData)); + compareNoHash(t, mapToObj(exportData), { + 'transcript.v1.0': t0, + 'transcript.v1.2': t2, + 'transcript.v1.4': t4, + 'transcript.v1.current': tc, + }); + t.deepEqual(artifactNames, ['transcript.v1.6.8']); + const db2 = await reImport(t, dbDir, 'operational'); + t.is(db2.prepare('SELECT COUNT(*) FROM transcriptItems').pluck().get(), 2); + t.is(db2.prepare('SELECT COUNT(*) FROM transcriptSpans').pluck().get(), 4); + } + + // an "archival"-mode export should list all four spans, with + // artifacts for each + { + const { exportData, artifactNames } = await getExport(dbDir, 'archival'); + compareNoHash(t, mapToObj(exportData), { + 'transcript.v1.0': t0, + 'transcript.v1.2': t2, + 'transcript.v1.4': 
t4, + 'transcript.v1.current': tc, + }); + t.deepEqual(artifactNames, [ + 'transcript.v1.0.2', + 'transcript.v1.2.4', + 'transcript.v1.4.6', + 'transcript.v1.6.8', + ]); + const db2 = await reImport(t, dbDir, 'archival'); + t.is(db2.prepare('SELECT COUNT(*) FROM transcriptItems').pluck().get(), 8); + t.is(db2.prepare('SELECT COUNT(*) FROM transcriptSpans').pluck().get(), 4); + } + + // prepare for deletion, this adds a new "closed" record, and + // deletes the .current record (i.e. it transforms .current into a + // closed record) + { + transcriptStore.stopUsingTranscript(vatID); + await commit(); + compareNoHash(t, currentExportData, { + 'transcript.v1.0': t0, + 'transcript.v1.2': t2, + 'transcript.v1.4': t4, + 'transcript.v1.6': t6, + }); + exportLog.length = 0; + // stopUsingTranscript is idempotent + transcriptStore.stopUsingTranscript(vatID); + await commit(); + t.is(exportLog.length, 0); + } + + // All exports (debug and non-debug) in this "terminated but not + // deleted" state will still have the export-data keys. Only + // debug-mode will have artifacts. 
+ for (const mode of ['operational', 'replay', 'archival', 'debug']) { + const { exportData, artifactNames } = await getExport(dbDir, mode); + compareNoHash(t, mapToObj(exportData), { + 'transcript.v1.0': t0, + 'transcript.v1.2': t2, + 'transcript.v1.4': t4, + 'transcript.v1.6': t6, + }); + if (mode === 'debug') { + t.deepEqual(artifactNames, [ + 'transcript.v1.0.2', + 'transcript.v1.2.4', + 'transcript.v1.4.6', + 'transcript.v1.6.8', + ]); + } else { + t.deepEqual(artifactNames, []); + } + const db2 = await reImport(t, dbDir, 'operational'); + t.is(db2.prepare('SELECT COUNT(*) FROM transcriptItems').pluck().get(), 0); + t.is(db2.prepare('SELECT COUNT(*) FROM transcriptSpans').pluck().get(), 4); + } + + // first deletion + { + // budget=1 will let it delete one span, the last one + const dc = transcriptStore.deleteVatTranscripts(vatID, 1); + t.false(dc.done); + t.is(dc.cleanups, 1); + await commit(); + compareNoHash(t, currentExportData, { + 'transcript.v1.0': t0, + 'transcript.v1.2': t2, + 'transcript.v1.4': t4, + }); + t.is(db.prepare('SELECT COUNT(*) FROM transcriptItems').pluck().get(), 6); + t.is(db.prepare('SELECT COUNT(*) FROM transcriptSpans').pluck().get(), 3); + } + + // Exports in this partially-deleted state should be coherent: they + // provide a subset of the older spans (the not-yet-deleted ones, + // all of which have isCurrent=0) and no items (even for + // not-yet-deleted spans). The import-time assertComplete() test + // must be satisfied. 
+ + for (const mode of ['operational', 'replay', 'archival', 'debug']) { + const { exportData, artifactNames } = await getExport(dbDir, mode); + compareNoHash(t, mapToObj(exportData), { + 'transcript.v1.0': t0, + 'transcript.v1.2': t2, + 'transcript.v1.4': t4, + }); + if (mode === 'debug') { + t.deepEqual(artifactNames, [ + 'transcript.v1.0.2', + 'transcript.v1.2.4', + 'transcript.v1.4.6', + ]); + } else { + t.deepEqual(artifactNames, []); + } + const db2 = await reImport(t, dbDir, 'operational'); + t.is(db2.prepare('SELECT COUNT(*) FROM transcriptItems').pluck().get(), 0); + t.is(db2.prepare('SELECT COUNT(*) FROM transcriptSpans').pluck().get(), 3); + } + + // second deletion + { + const dc = transcriptStore.deleteVatTranscripts(vatID, 1); + t.false(dc.done); + t.is(dc.cleanups, 1); + await commit(); + compareNoHash(t, currentExportData, { + 'transcript.v1.0': t0, + 'transcript.v1.2': t2, + }); + t.is(db.prepare('SELECT COUNT(*) FROM transcriptItems').pluck().get(), 4); + t.is(db.prepare('SELECT COUNT(*) FROM transcriptSpans').pluck().get(), 2); + } + + for (const mode of ['operational', 'replay', 'archival', 'debug']) { + const { exportData, artifactNames } = await getExport(dbDir, mode); + compareNoHash(t, mapToObj(exportData), { + 'transcript.v1.0': t0, + 'transcript.v1.2': t2, + }); + if (mode === 'debug') { + t.deepEqual(artifactNames, ['transcript.v1.0.2', 'transcript.v1.2.4']); + } else { + t.deepEqual(artifactNames, []); + } + const db2 = await reImport(t, dbDir, 'operational'); + t.is(db2.prepare('SELECT COUNT(*) FROM transcriptItems').pluck().get(), 0); + t.is(db2.prepare('SELECT COUNT(*) FROM transcriptSpans').pluck().get(), 2); + } + + // last deletion, enough budget to finish + { + const dc = transcriptStore.deleteVatTranscripts(vatID, 5); + t.true(dc.done); + t.is(dc.cleanups, 2); + await commit(); + compareNoHash(t, currentExportData, {}); + t.is(db.prepare('SELECT COUNT(*) FROM transcriptItems').pluck().get(), 0); + t.is(db.prepare('SELECT COUNT(*) 
FROM transcriptSpans').pluck().get(), 0); + } + + for (const mode of ['operational', 'replay', 'archival', 'debug']) { + const { exportData, artifactNames } = await getExport(dbDir, mode); + compareNoHash(t, mapToObj(exportData), {}); + t.deepEqual(artifactNames, []); + const db2 = await reImport(t, dbDir, 'operational'); + t.is(db2.prepare('SELECT COUNT(*) FROM transcriptItems').pluck().get(), 0); + t.is(db2.prepare('SELECT COUNT(*) FROM transcriptSpans').pluck().get(), 0); + } + + // deleteVatTranscripts is idempotent + { + exportLog.length = 0; + const dc = transcriptStore.deleteVatTranscripts(vatID, 5); + t.true(dc.done); + t.is(dc.cleanups, 0); + await commit(); + t.is(exportLog.length, 0); + } +}); + +test('slow deletion without stopUsingTranscript', async t => { + // slow deletion should work even without stopUsingTranscript + const { dbDir, commit, transcriptStore, currentExportData, vatID } = + await setupTranscript(t); + + // first deletion + { + // budget=1 will let it delete one span, the last one. 
Because we + // didn't call stopUsingTranscript, this also removes the .current + // record + const dc = transcriptStore.deleteVatTranscripts(vatID, 1); + t.false(dc.done); + t.is(dc.cleanups, 1); + await commit(); + const t0 = { vatID, startPos: 0, endPos: 2, isCurrent: 0, incarnation: 0 }; + const t2 = { vatID, startPos: 2, endPos: 4, isCurrent: 0, incarnation: 0 }; + const t4 = { vatID, startPos: 4, endPos: 6, isCurrent: 0, incarnation: 1 }; + compareNoHash(t, currentExportData, { + 'transcript.v1.0': t0, + 'transcript.v1.2': t2, + 'transcript.v1.4': t4, + }); + const { exportData, artifactNames } = await getExport(dbDir, 'operational'); + t.deepEqual(mapToObj(exportData), currentExportData); + t.deepEqual(artifactNames, []); + } + transcriptStore.deleteVatTranscripts(vatID); + await commit(); + t.deepEqual(currentExportData, {}); + { + const { exportData, artifactNames } = await getExport(dbDir, 'operational'); + t.deepEqual(exportData, new Map()); + t.deepEqual(artifactNames, []); + } +}); + +test('full deletion without stopUsingTranscript', async t => { + // full deletion should work even without stopUsingTranscript + const { dbDir, commit, transcriptStore, currentExportData, vatID } = + await setupTranscript(t); + const dc = transcriptStore.deleteVatTranscripts(vatID); + t.true(dc.done); + await commit(); + t.deepEqual(currentExportData, {}); + const { exportData, artifactNames } = await getExport(dbDir, 'operational'); + t.deepEqual(exportData, new Map()); + t.deepEqual(artifactNames, []); +}); + +const setupSnapshots = async t => { + const vatID = 'v1'; + const exportLog = []; + const currentExportData = {}; + const exportCallback = exports => { + for (const [key, value] of exports) { + exportLog.push([key, value]); + } + mergeExportDeltas(currentExportData, exports); + }; + const [dbDir, cleanup] = await tmpDir('testdb'); + t.teardown(cleanup); + const store = initSwingStore(dbDir, { exportCallback }); + const { kernelStorage, hostStorage } = store; + 
const { snapStore } = kernelStorage; + const { commit } = hostStorage; + // look directly at DB to confirm changes + const db = sqlite3(path.join(dbDir, 'swingstore.sqlite')); + + await snapStore.saveSnapshot(vatID, 10, getSnapshotStream()); + await snapStore.saveSnapshot(vatID, 11, getSnapshotStream()); + await snapStore.saveSnapshot(vatID, 12, getSnapshotStream()); + // nothing is written to exportCallback until endCrank() or commit() + t.deepEqual(exportLog, []); + await commit(); + const hash = JSON.parse(exportLog[0][1]).hash; + + return { + db, + dbDir, + commit, + snapStore, + exportLog, + currentExportData, + vatID, + hash, + }; +}; + +test('slow deletion of snapshots', async t => { + // slow snapshot deletion should remove export-data as it removes + // snapshots + const { + db, + dbDir, + commit, + snapStore, + exportLog, + currentExportData, + vatID, + hash, + } = await setupSnapshots(t); + t.deepEqual(currentExportData, { + 'snapshot.v1.10': JSON.stringify({ vatID, snapPos: 10, hash, inUse: 0 }), + 'snapshot.v1.11': JSON.stringify({ vatID, snapPos: 11, hash, inUse: 0 }), + 'snapshot.v1.12': JSON.stringify({ vatID, snapPos: 12, hash, inUse: 1 }), + 'snapshot.v1.current': 'snapshot.v1.12', + }); + + t.is(db.prepare('SELECT COUNT(*) FROM snapshots').pluck().get(), 3); + { + // export should mention all spans, with a single current artifact + const { exportData, artifactNames } = await getExport(dbDir, 'operational'); + t.deepEqual(currentExportData, mapToObj(exportData)); + t.is(exportData.get('snapshot.v1.current'), 'snapshot.v1.12'); + t.deepEqual(artifactNames, ['snapshot.v1.12']); + } + + // Prepare for deletion, this clears the .inUse flag on the latest + // record, and deletes the .current record. Exports stop including + // any artifacts. 
+ { + snapStore.stopUsingLastSnapshot(vatID); + await commit(); + t.deepEqual(currentExportData, { + 'snapshot.v1.10': JSON.stringify({ vatID, snapPos: 10, hash, inUse: 0 }), + 'snapshot.v1.11': JSON.stringify({ vatID, snapPos: 11, hash, inUse: 0 }), + 'snapshot.v1.12': JSON.stringify({ vatID, snapPos: 12, hash, inUse: 0 }), + }); + const { exportData, artifactNames } = await getExport(dbDir, 'operational'); + t.deepEqual(currentExportData, mapToObj(exportData)); + t.deepEqual(artifactNames, []); + exportLog.length = 0; + // stopUsingLastSnapshot is idempotent + snapStore.stopUsingLastSnapshot(vatID); + await commit(); + t.is(exportLog.length, 0); + } + + // first deletion + { + // budget=1 will let it delete one snapshot + const dc = snapStore.deleteVatSnapshots(vatID, 1); + t.false(dc.done); + t.is(dc.cleanups, 1); + await commit(); + t.deepEqual(currentExportData, { + 'snapshot.v1.10': JSON.stringify({ vatID, snapPos: 10, hash, inUse: 0 }), + 'snapshot.v1.11': JSON.stringify({ vatID, snapPos: 11, hash, inUse: 0 }), + }); + t.is(db.prepare('SELECT COUNT(*) FROM snapshots').pluck().get(), 2); + // export should mention fewer spans, have no .current or + // artifacts + const { exportData, artifactNames } = await getExport(dbDir, 'operational'); + t.deepEqual(currentExportData, mapToObj(exportData)); + t.deepEqual(artifactNames, []); + // and it should be importable + const db2 = await reImport(t, dbDir, 'operational'); + t.is(db2.prepare('SELECT COUNT(*) FROM snapshots').pluck().get(), 2); + const db3 = await reImport(t, dbDir, 'archival'); + t.is(db3.prepare('SELECT COUNT(*) FROM snapshots').pluck().get(), 2); + } + + // second+last deletion, enough budget to delete both remaining + // snapshots + { + const dc = snapStore.deleteVatSnapshots(vatID, 5); + t.true(dc.done); + t.is(dc.cleanups, 2); + await commit(); + t.deepEqual(currentExportData, {}); + t.is(db.prepare('SELECT COUNT(*) FROM snapshots').pluck().get(), 0); + // export should mention nothing + const { 
exportData, artifactNames } = await getExport(dbDir, 'operational'); + t.deepEqual(currentExportData, mapToObj(exportData)); + t.deepEqual(artifactNames, []); + } +}); + +test('slow deletion without stopUsingLastSnapshot', async t => { + // slow snapshot deletion should work even without + // stopUsingLastSnapshot + const { dbDir, commit, snapStore, currentExportData, vatID, hash } = + await setupSnapshots(t); + + { + // budget=1 will let it delete one snapshot, the last one. Because + // we didn't call stopUsingLastSnapshot, this also removes the + // .current record + const dc = snapStore.deleteVatSnapshots(vatID, 1); + t.false(dc.done); + t.is(dc.cleanups, 1); + await commit(); + t.deepEqual(currentExportData, { + 'snapshot.v1.10': JSON.stringify({ vatID, snapPos: 10, hash, inUse: 0 }), + 'snapshot.v1.11': JSON.stringify({ vatID, snapPos: 11, hash, inUse: 0 }), + }); + const { exportData, artifactNames } = await getExport(dbDir, 'operational'); + t.deepEqual(mapToObj(exportData), currentExportData); + t.deepEqual(artifactNames, []); + } + + { + const dc = snapStore.deleteVatSnapshots(vatID, 1); + t.false(dc.done); + t.is(dc.cleanups, 1); + await commit(); + t.deepEqual(currentExportData, { + 'snapshot.v1.10': JSON.stringify({ vatID, snapPos: 10, hash, inUse: 0 }), + }); + const { exportData, artifactNames } = await getExport(dbDir, 'operational'); + t.deepEqual(mapToObj(exportData), currentExportData); + t.deepEqual(artifactNames, []); + } + + { + const dc = snapStore.deleteVatSnapshots(vatID, 1); + t.true(dc.done); + t.is(dc.cleanups, 1); + await commit(); + t.deepEqual(currentExportData, {}); + const { exportData, artifactNames } = await getExport(dbDir, 'operational'); + t.deepEqual(mapToObj(exportData), currentExportData); + t.deepEqual(artifactNames, []); + } +}); + +test('full deletion without stopUsingLastSnapshot', async t => { + // full snapshot deletion should work even without + // stopUsingLastSnapshot + const { dbDir, commit, snapStore, 
currentExportData, vatID } = + await setupSnapshots(t); + + { + const dc = snapStore.deleteVatSnapshots(vatID); + t.true(dc.done); + // no budget means no accounting, ignore dc.cleanups + await commit(); + t.deepEqual(currentExportData, {}); + const { exportData, artifactNames } = await getExport(dbDir, 'operational'); + t.deepEqual(mapToObj(exportData), currentExportData); + t.deepEqual(artifactNames, []); + } +});