From 8c8ce27df085ac2b90704266cbbfd85c3a1c17b9 Mon Sep 17 00:00:00 2001 From: FluxCapacitor2 <31071265+FluxCapacitor2@users.noreply.github.com> Date: Wed, 8 Jan 2025 12:20:39 -0500 Subject: [PATCH] Replace SELECT + REPLACE INTO with one INSERT with an UPSERT clause This really shows you that fresh eyes are important! I thought I checked my triggers yesterday, but today, I had the idea to run the setup SQL in the `sqlite3` REPL and remove various parts until it worked. It turns out one of my triggers was broken, but I didn't catch it earlier because REPLACE INTO deletes and re-creates the entire row instead of updating it. The "4 values for 5 columns" error was indeed the problem, I just didn't look closely enough. --- app/database/db_sqlite.go | 28 ++++++---------------------- app/database/db_sqlite_setup.sql | 2 +- 2 files changed, 7 insertions(+), 23 deletions(-) diff --git a/app/database/db_sqlite.go b/app/database/db_sqlite.go index d1fb82e..33667d4 100644 --- a/app/database/db_sqlite.go +++ b/app/database/db_sqlite.go @@ -60,16 +60,12 @@ func (db *SQLiteDatabase) AddDocument(ctx context.Context, source string, depth return id, err } - oldID := int64(-1) - err = tx.QueryRowContext(ctx, "SELECT id FROM pages WHERE source = ? AND url = ?;", source, url).Scan(&oldID) - if err != nil && err != sql.ErrNoRows { - if err := tx.Rollback(); err != nil { - return id, err - } - return id, err - } - - err = tx.QueryRowContext(ctx, "REPLACE INTO pages (source, depth, status, url, title, description, content, errorInfo) VALUES (?, ?, ?, ?, ?, ?, ?, ?) RETURNING (id);", source, depth, status, url, title, description, content, errorInfo).Scan(&id) + err = tx.QueryRowContext(ctx, ` + INSERT INTO pages (source, depth, status, url, title, description, content, errorInfo) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + ON CONFLICT DO UPDATE SET depth = min(depth, excluded.depth), status = excluded.status, title = excluded.title, description = excluded.description, content = excluded.content, errorInfo = excluded.errorInfo, crawledAt = CURRENT_TIMESTAMP + RETURNING id; + `, source, depth, status, url, title, description, content, errorInfo).Scan(&id) if err != nil { if err := tx.Rollback(); err != nil { return id, err @@ -78,18 +74,6 @@ func (db *SQLiteDatabase) AddDocument(ctx context.Context, source string, depth } for _, ref := range referrers { - if oldID != -1 && id != oldID && ref == oldID { - // Special case: within the transaction, if the REPLACE INTO replaces an existing row, - // it's possible that one of the referrers now points to a page that no longer exists. - // If this happens, we should just ignore it because pages shouldn't point to themselves anyways. - - // 2025/01/08: I tried for ~3 hours to just use a normal UPSERT-style statement to avoid the extra SELECT: - // "INSERT INTO pages ... ON CONFLICT DO UPDATE SET ...", but I kept getting errors like "4 values for 5 columns" - // and "sub-select returns 4 columns - expected 1" even though I'm not doing any sub-selects and every sub-select - // in a trigger uses the correct number of columns (and even for regular UPDATE statements too!). - // I've decided to give up for now. :/ - continue - } if ref == id { // Pages don't need to reference themselves continue diff --git a/app/database/db_sqlite_setup.sql b/app/database/db_sqlite_setup.sql index 9a05b43..c4aedb2 100644 --- a/app/database/db_sqlite_setup.sql +++ b/app/database/db_sqlite_setup.sql @@ -119,7 +119,7 @@ END; CREATE TRIGGER IF NOT EXISTS pages_auto_update AFTER UPDATE ON pages BEGIN INSERT INTO pages_fts(pages_fts, rowid, url, title, description, content) VALUES('delete', old.rowid, old.url, old.title, old.description, old.content); - INSERT INTO pages_fts(rowid, url, title, description, content) VALUES (new.url, new.title, new.description, new.content); + INSERT INTO pages_fts(rowid, url, title, description, content) VALUES (new.rowid, new.url, new.title, new.description, new.content); -- Remove crawl queue entry if it exists DELETE FROM crawl_queue WHERE source = new.source AND url = new.url; END;