From 65938e5ae756270e31e9b11b87a445e08774f8a2 Mon Sep 17 00:00:00 2001
From: FluxCapacitor2 <31071265+FluxCapacitor2@users.noreply.github.com>
Date: Wed, 7 Aug 2024 21:41:42 -0400
Subject: [PATCH] Move DB setup logic to its own file, simplify AddDocument,
 and use triggers to update FTS virtual table

---
 app/crawler/crawler.go           |   6 +-
 app/database/db.go               |   2 +-
 app/database/db_sqlite.go        | 150 +++----------------
 app/database/db_sqlite_setup.sql |  69 ++++++++++++++
 app/easysearch.go                |   5 +-
 5 files changed, 89 insertions(+), 143 deletions(-)
 create mode 100644 app/database/db_sqlite_setup.sql

diff --git a/app/crawler/crawler.go b/app/crawler/crawler.go
index 76c5695..46138c4 100644
--- a/app/crawler/crawler.go
+++ b/app/crawler/crawler.go
@@ -109,14 +109,12 @@ func Crawl(source config.Source, currentDepth int32, db database.Database, pageU
 		for _, item := range element.DOM.Nodes {
 			content += getText(item)
 		}
-		_, err = db.AddDocument(source.ID, currentDepth, canonical, database.Finished, title, description, content)
+		err = db.AddDocument(source.ID, currentDepth, canonical, database.Finished, title, description, content)
 	} else {
-
 		if len(title) == 0 {
 			title = article.Title
 		}
-
-		_, err = db.AddDocument(source.ID, currentDepth, canonical, database.Finished, title, description, article.TextContent)
+		err = db.AddDocument(source.ID, currentDepth, canonical, database.Finished, title, description, article.TextContent)
 	}
 
 	if err != nil {
diff --git a/app/database/db.go b/app/database/db.go
index bda492f..dd4390b 100644
--- a/app/database/db.go
+++ b/app/database/db.go
@@ -5,7 +5,7 @@ type Database interface {
 	Setup() error
 
 	// Add a page to the search index.
-	AddDocument(source string, depth int32, url string, status QueueItemStatus, title string, description string, content string) (*Page, error)
+	AddDocument(source string, depth int32, url string, status QueueItemStatus, title string, description string, content string) error
 
 	// Returns whether the given URL (or the URL's canonical) is indexed
 	HasDocument(source string, url string) (*bool, error)
diff --git a/app/database/db_sqlite.go b/app/database/db_sqlite.go
index f9e01e7..004ad9d 100644
--- a/app/database/db_sqlite.go
+++ b/app/database/db_sqlite.go
@@ -6,6 +6,8 @@ import (
 	"regexp"
 	"strings"
 
+	_ "embed"
+
 	"github.com/google/uuid"
 	_ "github.com/mattn/go-sqlite3"
 )
@@ -17,146 +19,22 @@ type SQLiteDatabase struct {
 	conn *sql.DB
 }
 
-func (db *SQLiteDatabase) Setup() error {
-	{
-		// Enable write-ahead logging for improved write performance (https://www.sqlite.org/wal.html)
-		_, err := db.conn.Exec("PRAGMA journal_mode=WAL;")
-
-		if err != nil {
-			return err
-		}
-	}
-
-	// TODO: use a composite index on the `source` AND `url` column instead of making `url` globally unique
-
-	{
-		_, err := db.conn.Exec(`
-		CREATE VIRTUAL TABLE IF NOT EXISTS pages_fts USING fts5 (
-			source UNINDEXED,
-			url,
-			title,
-			description,
-			content,
-			status UNINDEXED
-		)
-		`)
-
-		if err != nil {
-			return err
-		}
-	}
-
-	{
-		_, err := db.conn.Exec(`
-		CREATE TABLE IF NOT EXISTS pages (
-			source TEXT NOT NULL,
-			url TEXT NOT NULL UNIQUE,
-			crawledAt DATETIME DEFAULT CURRENT_TIMESTAMP,
-			depth INTEGER NOT NULL,
-			status INTEGER NOT NULL
-		)
-		`)
-
-		if err != nil {
+//go:embed db_sqlite_setup.sql
+var setupCommands string
 
-			return err
-		}
-	}
-
-	{
-		_, err := db.conn.Exec(`
-		CREATE TABLE IF NOT EXISTS crawl_queue (
-			source TEXT NOT NULL,
-			url TEXT NOT NULL UNIQUE,
-			status INTEGER DEFAULT 0,
-			depth INTEGER,
-			addedAt DATETIME DEFAULT CURRENT_TIMESTAMP,
-			updatedAt DATETIME DEFAULT CURRENT_TIMESTAMP
-		)
-		`)
-
-		if err != nil {
-			return err
-		}
-	}
-
-	{
-		_, err := db.conn.Exec(`
-		CREATE TABLE IF NOT EXISTS canonicals (
-			source TEXT NOT NULL,
-			url TEXT NOT NULL UNIQUE,
-			canonical TEXT NOT NULL,
-			crawledAt DATETIME DEFAULT CURRENT_TIMESTAMP
-		)
-		`)
-
-		if err != nil {
-			return err
-		}
-	}
-
-	return nil
+func (db *SQLiteDatabase) Setup() error {
+	_, err := db.conn.Exec(setupCommands)
+	return err
 }
 
-func (db *SQLiteDatabase) AddDocument(source string, depth int32, url string, status QueueItemStatus, title string, description string, content string) (*Page, error) {
-	tx, err := db.conn.Begin()
-
-	if err != nil {
-		fmt.Printf("Error starting transaction: %v\n", err)
-		return nil, err
-	}
-
-	// Remove old entries
-	{
-		_, err := tx.Exec(`
-		DELETE FROM crawl_queue WHERE source = ? AND url = ?;
-		DELETE FROM pages WHERE source = ? AND url = ?;
-		DELETE FROM pages_fts WHERE source = ? AND url = ?;
-		`, source, url, source, url, source, url)
-
-		if err != nil {
-			fmt.Printf("Error removing old entries: %v\n", err)
-			return nil, err
-		}
-	}
-
-	// Insert new records for the page (to prevent duplicates and record crawl time as a DATETIME) and the FTS entry (for user search queries)
-	{
-		_, err := tx.Exec("INSERT INTO pages (source, url, depth, status) VALUES (?, ?, ?, ?)", source, url, depth, status)
-		if err != nil {
-			fmt.Printf("Error inserting new page: %v\n", err)
-			return nil, err
-		}
-	}
-
-	result := tx.QueryRow("INSERT INTO pages_fts (source, url, title, description, content, status) VALUES (?, ?, ?, ?, ?, ?) RETURNING *", source, url, title, description, content, status)
-
-	// Return the newly-inserted row
-	row := &Page{}
-
-	{
-		err := result.Scan(&row.Source, &row.URL, &row.Title, &row.Description, &row.Content, &row.Status)
-
-		if err != nil {
-			fmt.Printf("Error scanning inserted row: %v\n", err)
-			return nil, err
-		}
-	}
-
-	{
-		err := tx.Commit()
-		if err != nil {
-			fmt.Printf("Error committing transaction: %v\n", err)
-			return nil, err
-		}
-	}
-
-	return row, nil
+func (db *SQLiteDatabase) AddDocument(source string, depth int32, url string, status QueueItemStatus, title string, description string, content string) error {
+	// Upsert so that re-adding a page (e.g. a canonical URL that is already indexed, or a page moving out of the Error state) updates the existing row instead of violating the (source, url) unique index; the UPDATE trigger keeps the FTS index in sync.
+	_, err := db.conn.Exec("INSERT INTO pages (source, depth, status, url, title, description, content) VALUES (?, ?, ?, ?, ?, ?, ?) ON CONFLICT (source, url) DO UPDATE SET depth = excluded.depth, status = excluded.status, title = excluded.title, description = excluded.description, content = excluded.content, crawledAt = CURRENT_TIMESTAMP;", source, depth, status, url, title, description, content)
+	return err
 }
 
 func (db *SQLiteDatabase) HasDocument(source string, url string) (*bool, error) {
 	// TODO: SELECTing the URL is unnecessary. we can just use a "SELECT 1" and see if any rows were returned.
 
-	cursor := db.conn.QueryRow("SELECT url FROM pages WHERE source = ? AND (url = ? OR url IN (SELECT canonical FROM canonicals WHERE url = ?))", source, url, url)
+	cursor := db.conn.QueryRow("SELECT url FROM pages WHERE source = ? AND (url = ? OR url IN (SELECT canonical FROM canonicals WHERE url = ?));", source, url, url)
 
 	page := &Page{}
 	err := cursor.Scan(&page.URL)
@@ -205,9 +83,9 @@ func (db *SQLiteDatabase) Search(sources []string, search string, page uint32, p
 	query := fmt.Sprintf(`SELECT
 		rank,
 		url,
-		highlight(pages_fts, 2, ?, ?) title,
-		snippet(pages_fts, 3, ?, ?, '…', 8) description,
-		snippet(pages_fts, 4, ?, ?, '…', 24) content
+		highlight(pages_fts, 3, ?, ?) title,
+		snippet(pages_fts, 4, ?, ?, '…', 8) description,
+		snippet(pages_fts, 5, ?, ?, '…', 24) content
 	FROM pages_fts WHERE source IN (%s) AND status = ? AND pages_fts MATCH ?
 	ORDER BY rank LIMIT ? OFFSET ?;
 	`, strings.Repeat("?, ", len(sources)-1)+"?")
diff --git a/app/database/db_sqlite_setup.sql b/app/database/db_sqlite_setup.sql
new file mode 100644
index 0000000..ded153d
--- /dev/null
+++ b/app/database/db_sqlite_setup.sql
@@ -0,0 +1,69 @@
+-- Enable write-ahead logging for improved write performance (https://www.sqlite.org/wal.html)
+PRAGMA journal_mode = wal;
+
+CREATE TABLE IF NOT EXISTS crawl_queue(
+    source TEXT NOT NULL,
+    url TEXT NOT NULL UNIQUE,
+    status INTEGER DEFAULT 0, -- Pending
+    depth INTEGER,
+    addedAt DATETIME DEFAULT CURRENT_TIMESTAMP,
+    updatedAt DATETIME DEFAULT CURRENT_TIMESTAMP
+);
+
+-- When a canonical URL is discovered, it is cached in this table to prevent excessively querying the target
+CREATE TABLE IF NOT EXISTS canonicals(
+    source TEXT NOT NULL,
+    url TEXT NOT NULL UNIQUE,
+    canonical TEXT NOT NULL,
+    crawledAt DATETIME DEFAULT CURRENT_TIMESTAMP
+);
+
+-- After a page is crawled, it is added to this table
+CREATE TABLE IF NOT EXISTS pages(
+    source TEXT NOT NULL,
+
+    crawledAt DATETIME DEFAULT CURRENT_TIMESTAMP,
+    depth INTEGER NOT NULL,
+    status INTEGER NOT NULL,
+
+    url TEXT NOT NULL,
+    title TEXT,
+    description TEXT,
+    content TEXT
+);
+
+-- Ensure a page can only be added once per source
+CREATE UNIQUE INDEX IF NOT EXISTS page_source_url ON pages(source, url);
+
+-- Create a full-text search table
+CREATE VIRTUAL TABLE IF NOT EXISTS pages_fts USING fts5(
+    source UNINDEXED,
+    status UNINDEXED,
+
+    url,
+    title,
+    description,
+    content,
+
+    -- Specify that this FTS table is an external content table that gets its text from the `pages` table
+    content=pages
+);
+
+-- Use triggers to automatically sync the FTS table with the content table
+-- https://sqlite.org/fts5.html#external_content_tables
+CREATE TRIGGER IF NOT EXISTS pages_auto_insert AFTER INSERT ON pages BEGIN
+    INSERT INTO pages_fts(rowid, source, status, url, title, description, content) VALUES (new.rowid, new.source, new.status, new.url, new.title, new.description, new.content);
+    -- Remove crawl queue entry if it exists
+    DELETE FROM crawl_queue WHERE source = new.source AND url = new.url;
+END;
+
+CREATE TRIGGER IF NOT EXISTS pages_auto_delete AFTER DELETE ON pages BEGIN
+    INSERT INTO pages_fts(pages_fts, rowid, source, status, url, title, description, content) VALUES ('delete', old.rowid, old.source, old.status, old.url, old.title, old.description, old.content);
+END;
+
+CREATE TRIGGER IF NOT EXISTS pages_auto_update AFTER UPDATE ON pages BEGIN
+    INSERT INTO pages_fts(pages_fts, rowid, source, status, url, title, description, content) VALUES ('delete', old.rowid, old.source, old.status, old.url, old.title, old.description, old.content);
+    INSERT INTO pages_fts(rowid, source, status, url, title, description, content) VALUES (new.rowid, new.source, new.status, new.url, new.title, new.description, new.content);
+    -- Remove crawl queue entry if it exists
+    DELETE FROM crawl_queue WHERE source = new.source AND url = new.url;
+END;
diff --git a/app/easysearch.go b/app/easysearch.go
index c307a05..18193f3 100644
--- a/app/easysearch.go
+++ b/app/easysearch.go
@@ -14,6 +14,7 @@ import (
 )
 
 // TODO: look into dependency injection instead of passing the DB and config into every function call
+// TODO: add a command-line option to rebuild the search index (https://sqlite.org/fts5.html#the_rebuild_command)
 
 func main() {
 
@@ -74,7 +75,7 @@ func startCrawl(db database.Database, config *config.Config) {
 		exists, err := db.HasDocument(src.ID, src.URL)
 
 		if err != nil {
-			fmt.Printf("Failed to look up document %v in pages table\n", err)
+			fmt.Printf("Failed to look up document '%v'/'%v' in pages table: %v\n", src.ID, src.URL, err)
 		} else {
 			if !*exists {
 				// If the document wasn't found, it should be added to the queue
@@ -136,7 +137,7 @@ func consumeQueue(db database.Database, config *config.Config) {
 
 		// Add an entry to the pages table to prevent immediately recrawling the same URL when referred from other sources
 		if result != nil {
-			_, err := db.AddDocument(src.ID, item.Depth, result.Canonical, database.Error, "", "", "")
+			err := db.AddDocument(src.ID, item.Depth, result.Canonical, database.Error, "", "", "")
 			if err != nil {
 				fmt.Printf("Failed to add page in 'error' state: %v\n", err)
 			}
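
Note: the new TODO in app/easysearch.go references FTS5's 'rebuild' command. A minimal sketch of what that command-line option could run against the schema added in this patch (the `rebuildIndex` method name is hypothetical, not part of the patch):

	// rebuildIndex discards the pages_fts index and repopulates it from the
	// `pages` content table (https://sqlite.org/fts5.html#the_rebuild_command).
	func (db *SQLiteDatabase) rebuildIndex() error {
		_, err := db.conn.Exec("INSERT INTO pages_fts(pages_fts) VALUES ('rebuild');")
		return err
	}

Because pages_fts is an external content table, the 'rebuild' command re-reads every row of `pages`, so the same statement also serves as a recovery path if the triggers ever let the index drift out of sync with the content table.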