Merge pull request #728 from EspressoSystems/ab/sqlite-support
ab/sqlite-support
imabdulbasit authored Nov 22, 2024
2 parents 7074bee + 5ffd7b7 commit 970d3c3
Showing 23 changed files with 762 additions and 376 deletions.
64 changes: 46 additions & 18 deletions .github/workflows/build.yml
@@ -8,18 +8,16 @@ on:
pull_request:
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
build:
runs-on: ubuntu-latest
env:
RUST_LOG: info
steps:
- uses: styfle/cancel-workflow-action@0.12.1
name: Cancel Outdated Builds
with:
all_but_latest: true
access_token: ${{ github.token }}

- uses: actions/checkout@v4
name: Checkout Repository

@@ -36,21 +34,12 @@ jobs:

# Run Clippy on all targets. The lint workflow doesn't run Clippy on tests, because the tests
# don't compile with all combinations of features.
- name: Clippy
- name: Clippy(all-features)
run: cargo clippy --workspace --all-features --all-targets -- -D warnings

# Install nextest
- name: Install Nextest
run: cargo install cargo-nextest

- name: Test
run: |
cargo nextest run --workspace --release --all-features
timeout-minutes: 60
- name: Clippy(no-storage)
run: cargo clippy --workspace --features no-storage --all-targets -- -D warnings

- name: Doc Test
run: cargo test --release --all-features --doc

- name: Generate Documentation
run: |
cargo doc --no-deps --lib --release --all-features
@@ -63,3 +52,42 @@ jobs:
github_token: ${{ secrets.GITHUB_TOKEN }}
publish_dir: ./target/doc
cname: tide-disco.docs.espressosys.com
test-sqlite:
runs-on: ubuntu-latest
env:
RUST_LOG: info
steps:
- uses: actions/checkout@v4
name: Checkout Repository

- uses: Swatinem/rust-cache@v2
name: Enable Rust Caching

# Install nextest
- name: Install Nextest
run: cargo install cargo-nextest

- name: Test
run: |
cargo nextest run --workspace --release --all-features
timeout-minutes: 60

test-postgres:
runs-on: ubuntu-latest
env:
RUST_LOG: info
steps:
- uses: actions/checkout@v4
name: Checkout Repository

- uses: Swatinem/rust-cache@v2
name: Enable Rust Caching

# Install nextest
- name: Install Nextest
run: cargo install cargo-nextest

- name: Test
run: |
cargo nextest run --workspace --release --features "no-storage, testing"
timeout-minutes: 60
5 changes: 4 additions & 1 deletion .gitignore
@@ -8,4 +8,7 @@ lcov.info

/vsc

/.vscode
/.vscode

# for sqlite databases created during the tests
/tmp
5 changes: 5 additions & 0 deletions Cargo.toml
@@ -20,6 +20,11 @@ license = "GPL-3.0-or-later"
[features]
default = ["file-system-data-source", "metrics-data-source", "sql-data-source"]

# Enables support for an embedded SQLite database instead of PostgreSQL.
# Ideal for lightweight nodes that benefit from pruning and merklized state storage,
# offering advantages over file system storage.
embedded-db = []

# Enable the availability data source backed by the local file system.
file-system-data-source = ["atomic_store"]

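Since this feature drives conditional compilation throughout the crate, here is a minimal sketch of the kind of `cfg` gating it enables. The constant and `main` function are illustrative only, not part of the crate's API; the example in `examples/simple-server.rs` below uses the same pattern.

```rust
// Illustrative only: pick a backend identifier at compile time based on the
// `embedded-db` feature. Building with `--features embedded-db` takes the
// SQLite branch; the default build takes the Postgres branch.
#[cfg(feature = "embedded-db")]
const BACKEND: &str = "sqlite";

#[cfg(not(feature = "embedded-db"))]
const BACKEND: &str = "postgres";

fn main() {
    println!("query service built with the {BACKEND} backend");
}
```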
23 changes: 14 additions & 9 deletions examples/simple-server.rs
@@ -84,15 +84,20 @@ async fn init_db() -> Db {
}

#[cfg(not(target_os = "windows"))]
async fn init_data_source(db: &Db) -> DataSource {
data_source::sql::Config::default()
.user("postgres")
.password("password")
.host(db.host())
.port(db.port())
.connect(Default::default())
.await
.unwrap()
async fn init_data_source(#[allow(unused_variables)] db: &Db) -> DataSource {
let mut cfg = data_source::sql::Config::default();

#[cfg(not(feature = "embedded-db"))]
{
cfg = cfg.host(db.host()).port(db.port());
}

#[cfg(feature = "embedded-db")]
{
cfg = cfg.db_path(db.path());
}

cfg.connect(Default::default()).await.unwrap()
}

#[cfg(target_os = "windows")]
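For clarity, under `--features embedded-db` the `cfg`-gated function above effectively reduces to the sketch below. The function name is illustrative; `Db`, `DataSource`, and `db.path()` are the example's own types and helpers, used exactly as in the diff, so this is not a standalone program.

```rust
// Sketch: the effective body of `init_data_source` when `embedded-db` is enabled.
async fn init_data_source_sqlite(db: &Db) -> DataSource {
    data_source::sql::Config::default()
        .db_path(db.path())
        .connect(Default::default())
        .await
        .unwrap()
}
```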
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
5 changes: 5 additions & 0 deletions migrations/postgres/V400__rename_transaction_table.sql
@@ -0,0 +1,5 @@
-- Rename to match the new SQLite schema, which uses `transactions` and `idx`
-- (`transaction` and `index` are also SQLite keywords).
ALTER TABLE transaction
RENAME TO transactions;

ALTER TABLE transactions
RENAME COLUMN index TO idx;
76 changes: 76 additions & 0 deletions migrations/sqlite/V100__init_schema.sql
@@ -0,0 +1,76 @@
CREATE TABLE header
(
height BIGINT PRIMARY KEY,
hash TEXT NOT NULL UNIQUE,
payload_hash TEXT NOT NULL,
timestamp BIGINT NOT NULL,

-- For convenience, we store the entire application-specific header type as JSON. Just like
-- `leaf.leaf` and `leaf.qc`, this allows us to easily reconstruct the entire header using
-- `serde_json`, and to run queries and create indexes on application-specific header fields
-- without having a specific column for those fields. In many cases, this will enable new
-- application-specific API endpoints to be implemented without altering the schema (beyond
-- possibly adding an index for performance reasons).
data JSONB NOT NULL
);

CREATE INDEX header_timestamp_idx ON header (timestamp);

CREATE TABLE payload
(
height BIGINT PRIMARY KEY REFERENCES header (height) ON DELETE CASCADE,
size INTEGER,
data BLOB,
num_transactions INTEGER
);

CREATE TABLE vid
(
height BIGINT PRIMARY KEY REFERENCES header (height) ON DELETE CASCADE,
common BLOB NOT NULL,
share BLOB
);

CREATE TABLE leaf
(
height BIGINT PRIMARY KEY REFERENCES header (height) ON DELETE CASCADE,
hash TEXT NOT NULL UNIQUE,
block_hash TEXT NOT NULL,

-- For convenience, we store the entire leaf and justifying QC as JSON blobs. There is a bit of
-- redundancy here with the indexed fields above, but it makes it easy to reconstruct the entire
-- leaf without depending on the specific fields of the application-specific leaf type. We
-- choose JSON over a binary format, even though it has a larger storage footprint, because
-- SQLite actually has decent JSON support: we don't have to worry about escaping non-ASCII
-- characters in inputs, and we can even do queries on the JSON and add indices over sub-objects
-- of the JSON blobs.
leaf JSONB NOT NULL,
qc JSONB NOT NULL
);

CREATE TABLE transactions
(
hash TEXT NOT NULL,
-- Block containing this transaction.
block_height BIGINT NOT NULL REFERENCES header(height) ON DELETE CASCADE,
-- Position within the block. Transaction indices are an application-specific type, so we store
-- it as a serialized blob. We use JSON instead of a binary format so that the application can
-- make use of the transaction index in its own SQL queries.
idx JSONB NOT NULL,
PRIMARY KEY (block_height, idx)
);
-- This index is not unique, because nothing stops HotShot from sequencing duplicate transactions.
CREATE INDEX transaction_hash ON transactions (hash);

CREATE TABLE pruned_height (
id INTEGER PRIMARY KEY AUTOINCREMENT,
-- The height of the last pruned block.
last_height BIGINT NOT NULL
);

CREATE TABLE last_merklized_state_height (
id INTEGER PRIMARY KEY AUTOINCREMENT,
height BIGINT NOT NULL
);

CREATE INDEX header_payload_hash_idx ON header (payload_hash);
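As a quick sanity check of this schema and of the JSON-stored fields described in the comments above, the following standalone sketch opens an in-memory SQLite database, applies a trimmed-down subset of the tables, and queries both the JSON header data and the JSON `idx` column. It uses the `rusqlite` crate purely for illustration, not the driver or query layer this crate actually uses, and assumes a SQLite build with the JSON1 functions (the default in recent releases and in rusqlite's `bundled` feature).

```rust
use rusqlite::{params, Connection};

fn main() -> rusqlite::Result<()> {
    // In-memory database with a trimmed subset of the schema above.
    let conn = Connection::open_in_memory()?;
    conn.execute_batch(
        "CREATE TABLE header (
             height BIGINT PRIMARY KEY,
             hash TEXT NOT NULL UNIQUE,
             payload_hash TEXT NOT NULL,
             timestamp BIGINT NOT NULL,
             data JSONB NOT NULL
         );
         CREATE TABLE transactions (
             hash TEXT NOT NULL,
             block_height BIGINT NOT NULL REFERENCES header(height) ON DELETE CASCADE,
             idx JSONB NOT NULL,
             PRIMARY KEY (block_height, idx)
         );
         CREATE INDEX transaction_hash ON transactions (hash);",
    )?;

    // Application-specific header data and the transaction index are stored as
    // serialized JSON, so their fields can be queried without schema changes.
    conn.execute(
        "INSERT INTO header (height, hash, payload_hash, timestamp, data)
         VALUES (?1, ?2, ?3, ?4, ?5)",
        params![1i64, "h1", "p1", 0i64, r#"{"view": 7}"#],
    )?;
    conn.execute(
        "INSERT INTO transactions (hash, block_height, idx) VALUES (?1, ?2, ?3)",
        params!["tx1", 1i64, "[0]"],
    )?;

    // Query a sub-field of the JSON header data and the JSON transaction index.
    let view: i64 = conn.query_row(
        "SELECT json_extract(data, '$.view') FROM header WHERE height = ?1",
        params![1i64],
        |row| row.get(0),
    )?;
    let idx: String = conn.query_row(
        "SELECT idx FROM transactions WHERE hash = ?1",
        params!["tx1"],
        |row| row.get(0),
    )?;
    println!("header.view = {view}, transaction idx = {idx}");
    Ok(())
}
```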
5 changes: 5 additions & 0 deletions migrations/sqlite/V200__create_aggregates_table.sql
@@ -0,0 +1,5 @@
CREATE TABLE aggregate (
height BIGINT PRIMARY KEY REFERENCES header (height) ON DELETE CASCADE,
num_transactions BIGINT NOT NULL,
payload_size BIGINT NOT NULL
);
25 changes: 20 additions & 5 deletions src/data_source/sql.rs
@@ -26,10 +26,12 @@ pub use anyhow::Error;
use hotshot_types::traits::node_implementation::NodeType;
pub use refinery::Migration;

pub use sql::{Config, Transaction};
pub use sql::Transaction;

pub type Builder<Types, Provider> = fetching::Builder<Types, SqlStorage, Provider>;

pub type Config = sql::Config;

impl Config {
/// Connect to the database with this config.
pub async fn connect<Types, P: AvailabilityProvider<Types>>(
@@ -78,9 +80,11 @@ impl Config {
///
/// ## Initialization
///
/// When creating a [`SqlDataSource`], the caller can use [`Config`] to specify the host, user, and
/// database to connect to. As such, [`SqlDataSource`] is not very opinionated about how the
/// Postgres instance is set up. The administrator must simply ensure that there is a database
/// When creating a [`SqlDataSource`] backed by PostgreSQL, the caller can use [`Config`] to specify the
/// host, user, and database for the connection. If the `embedded-db` feature is enabled, the caller
/// instead specifies the file path of an SQLite database.
/// As such, [`SqlDataSource`] is not very opinionated about how the
/// database instance is set up. The administrator must simply ensure that there is a database
/// dedicated to the [`SqlDataSource`] and a user with appropriate permissions (all on `SCHEMA` and
/// all on `DATABASE`) over that database.
///
@@ -96,18 +100,29 @@
/// GRANT ALL ON DATABASE hotshot_query_service TO hotshot_user WITH GRANT OPTION;
/// ```
///
/// One could then connect to this database with the following [`Config`]:
/// For SQLite, simply provide the file path, and the file will be created if it does not already exist.
///
/// One could then connect to this database with the following [`Config`] for Postgres:
///
/// ```
/// # use hotshot_query_service::data_source::sql::Config;
/// #[cfg(not(feature = "embedded-db"))]
/// Config::default()
/// .host("postgres.database.hostname")
/// .database("hotshot_query_service")
/// .user("hotshot_user")
/// .password("password")
/// # ;
/// ```
/// Or, if the `embedded-db` feature is enabled, configure it as follows for SQLite:
///
/// ```
/// # use hotshot_query_service::data_source::sql::Config;
/// #[cfg(feature = "embedded-db")]
/// Config::default()
/// .db_path("temp.db".into())
/// # ;
/// ```
/// ## Resetting
///
/// In general, resetting the database when necessary is left up to the administrator. However, for
