-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Move DB setup logic to its own file, simplify AddDocument, and use tr…
…iggers to update FTS virtual table
- Loading branch information
1 parent
3eca33b
commit 65938e5
Showing
5 changed files
with
89 additions
and
143 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
-- Switch the journal to write-ahead logging (WAL) for better write performance.
-- See https://www.sqlite.org/wal.html
PRAGMA journal_mode = WAL;
|
||
-- URLs queued for crawling. `url` is unique across the whole table.
CREATE TABLE IF NOT EXISTS crawl_queue (
    source    TEXT     NOT NULL,
    url       TEXT     NOT NULL UNIQUE,
    status    INTEGER  DEFAULT 0,                 -- 0 = Pending
    depth     INTEGER,
    addedAt   DATETIME DEFAULT CURRENT_TIMESTAMP,
    updatedAt DATETIME DEFAULT CURRENT_TIMESTAMP
);
|
||
-- Cache of discovered canonical URLs, so the target does not have to be queried
-- again for a URL whose canonical form is already known.
CREATE TABLE IF NOT EXISTS canonicals (
    source    TEXT     NOT NULL,
    url       TEXT     NOT NULL UNIQUE,
    canonical TEXT     NOT NULL,
    crawledAt DATETIME DEFAULT CURRENT_TIMESTAMP
);
|
||
-- Crawled pages: a row is added here once a page has been crawled.
CREATE TABLE IF NOT EXISTS pages (
    source      TEXT     NOT NULL,

    -- Crawl metadata
    crawledAt   DATETIME DEFAULT CURRENT_TIMESTAMP,
    depth       INTEGER  NOT NULL,
    status      INTEGER  NOT NULL,

    -- Page data
    url         TEXT     NOT NULL,
    title       TEXT,
    description TEXT,
    content     TEXT
);
|
||
-- A given (source, url) pair may appear in `pages` at most once
CREATE UNIQUE INDEX IF NOT EXISTS page_source_url
    ON pages (source, url);
|
||
-- Full-text search index over `pages`.
--
-- `source` and `status` are declared UNINDEXED: they are part of the table but
-- are not added to the full-text index. `url`, `title`, `description` and
-- `content` are the searchable columns.
--
-- NOTE(review): `pages` declares no INTEGER PRIMARY KEY, so this index is keyed
-- on its implicit rowid. SQLite's VACUUM may renumber implicit rowids, which
-- would desynchronise the index -- confirm whether VACUUM is ever run.
CREATE VIRTUAL TABLE IF NOT EXISTS pages_fts USING fts5(
    source UNINDEXED,
    status UNINDEXED,

    url,
    title,
    description,
    content,

    -- External content table (not "contentless"): only the index is stored here
    -- and column values are read back from `pages` by rowid.
    -- https://sqlite.org/fts5.html#external_content_tables
    content=pages
);
|
||
-- Keep the FTS table in sync with its content table by means of triggers:
-- https://sqlite.org/fts5.html#external_content_tables
--
-- pages_auto_insert: index every newly inserted page, then drop its crawl-queue
-- entry if there is one.
CREATE TRIGGER IF NOT EXISTS pages_auto_insert
AFTER INSERT ON pages
BEGIN
    INSERT INTO pages_fts (rowid, source, status, url, title, description, content)
    VALUES (new.rowid, new.source, new.status, new.url, new.title, new.description, new.content);

    -- The page now exists in `pages`, so a matching queue entry is obsolete
    DELETE FROM crawl_queue
    WHERE url = new.url
      AND source = new.source;
END;
|
||
-- pages_auto_delete: remove a deleted page from the full-text index.
--
-- For an external-content FTS5 table the special 'delete' command must be given
-- the rowid and the column values that were originally indexed. The column list
-- and the VALUES list have to line up position by position; previously `rowid`
-- was listed after `status` while old.rowid was supplied first, so the rowid,
-- source and status values were each written to the wrong column.
--
-- NOTE: because of IF NOT EXISTS, a database that already contains an earlier
-- definition of this trigger keeps it; drop the trigger first to pick up this one.
CREATE TRIGGER IF NOT EXISTS pages_auto_delete
AFTER DELETE ON pages
BEGIN
    INSERT INTO pages_fts (pages_fts, rowid, source, status, url, title, description, content)
    VALUES ('delete', old.rowid, old.source, old.status, old.url, old.title, old.description, old.content);
END;
|
||
-- pages_auto_update: re-index a page whenever its row changes, then drop its
-- crawl-queue entry if there is one.
--
-- An update is applied to the FTS index as "delete the old entry, insert the
-- new one". Both statements must list columns and values in matching order:
--   * the 'delete' command previously had `rowid` listed after `status` while
--     old.rowid was supplied first, shifting three values into wrong columns;
--   * the re-insert previously named five columns but supplied four values and
--     omitted new.rowid, source and status.
--
-- NOTE: because of IF NOT EXISTS, a database that already contains an earlier
-- definition of this trigger keeps it; drop the trigger first to pick up this one.
CREATE TRIGGER IF NOT EXISTS pages_auto_update
AFTER UPDATE ON pages
BEGIN
    -- Remove the index entry built from the old column values
    INSERT INTO pages_fts (pages_fts, rowid, source, status, url, title, description, content)
    VALUES ('delete', old.rowid, old.source, old.status, old.url, old.title, old.description, old.content);

    -- Index the new column values under the row's current rowid
    INSERT INTO pages_fts (rowid, source, status, url, title, description, content)
    VALUES (new.rowid, new.source, new.status, new.url, new.title, new.description, new.content);

    -- Remove crawl queue entry if it exists
    DELETE FROM crawl_queue WHERE source = new.source AND url = new.url;
END;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters